It Starts with a Dataset


In [1]:
def pretty_print_review_and_label(i):
    """Print one line pairing the label at index ``i`` with a review preview.

    The printed line is the label, a tab-colon-tab separator, the first 80
    characters of the review, and a literal ``...`` suffix (appended even
    when the review is shorter than 80 characters).

    Reads the module-level ``labels`` and ``reviews`` sequences.
    """
    label = labels[i]
    preview = reviews[i][:80]
    print("\t:\t".join((label, preview)) + "...")

# Load the corpus: reviews.txt holds one review per line and labels.txt holds
# one label per line; the two lists are indexed in parallel by the cells below.
#
# The files are opened with `with` so the handles are closed even if reading
# raises. Only the line terminator is removed (rstrip('\n')) rather than
# blindly dropping the last character, so a final line without a trailing
# newline is not truncated and any trailing spaces in a review are kept.
with open('reviews.txt', 'r') as review_file:
    reviews = [line.rstrip('\n') for line in review_file]

# Labels are upper-cased so they can be compared against 'POSITIVE'.
with open('labels.txt', 'r') as label_file:
    labels = [line.rstrip('\n').upper() for line in label_file]

In [2]:
# Echo the first loaded review (the cell's last expression is displayed).
reviews[0]


Out[2]:
'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   '

In [3]:
# Echo the label stored at the same index as the first review.
labels[0]


Out[3]:
'POSITIVE'

In [4]:
# Print a header row followed by a fixed selection of label/review previews.
print("labels.txt \t : \t reviews.txt\n")
for sample_index in (2137, 12816, 6267, 21934, 5297, 4998):
    pretty_print_review_and_label(sample_index)


labels.txt 	 : 	 reviews.txt

NEGATIVE	:	this movie is terrible but it has some good effects .  ...
POSITIVE	:	adrian pasdar is excellent is this film . he makes a fascinating woman .  ...
NEGATIVE	:	comment this movie is impossible . is terrible  very improbable  bad interpretat...
POSITIVE	:	excellent episode movie ala pulp fiction .  days   suicides . it doesnt get more...
NEGATIVE	:	if you haven  t seen this  it  s terrible . it is pure trash . i saw this about ...
POSITIVE	:	this schiffer guy is a real genius  the movie is of excellent quality and both e...

In [7]:
import numpy as np
from collections import Counter

# Per-class and overall word tallies. Reviews are split on single spaces, so
# runs of spaces produce empty-string tokens that are counted like any other.
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

for idx, review in enumerate(reviews):
    # Any label other than 'POSITIVE' is tallied as negative.
    class_counts = positive_counts if labels[idx] == 'POSITIVE' else negative_counts
    for token in review.split(" "):
        class_counts[token] += 1
        total_counts[token] += 1

# Raw positive-to-negative usage ratio for words seen more than 10 times in
# total. The +1 in the denominator avoids division by zero for words that
# never occur in a negative review.
pos_neg_ratios = Counter()

for term, occurrences in total_counts.most_common():
    if occurrences <= 10:
        continue
    pos_neg_ratios[term] = positive_counts[term] / float(negative_counts[term] + 1)

# Replace each raw ratio with a log-scaled score. most_common() returns a
# list snapshot, so reassigning entries while looping over it is safe.
# Ratios above 1 become log(ratio); the rest become -log(1 / (ratio + 0.01)),
# where the 0.01 offset keeps a ratio of 0 finite.
for term, raw_ratio in pos_neg_ratios.most_common():
    if raw_ratio > 1:
        score = np.log(raw_ratio)
    else:
        score = -np.log((1 / (raw_ratio + 0.01)))
    pos_neg_ratios[term] = score

In [8]:
# Words ranked by their positive-to-negative log-ratio score, highest first:
# the top entries are the words most characteristic of reviews with a
# "POSITIVE" label (a ranking by ratio, not by raw frequency).
# NOTE(review): called with no argument, most_common() returns every scored
# word, so this cell echoes the whole scored vocabulary; consider limiting it,
# e.g. most_common(30), when the notebook is next re-run.
pos_neg_ratios.most_common()


Out[8]:
[('edie', 4.6913478822291435),
 ('antwone', 4.4773368144782069),
 ('din', 4.4067192472642533),
 ('gunga', 4.1896547420264252),
 ('goldsworthy', 4.1743872698956368),
 ('gypo', 4.0943445622221004),
 ('yokai', 4.0943445622221004),
 ('paulie', 4.0775374439057197),
 ('visconti', 3.9318256327243257),
 ('flavia', 3.9318256327243257),
 ('kells', 3.8712010109078911),
 ('blandings', 3.8712010109078911),
 ('brashear', 3.8501476017100584),
 ('gino', 3.8286413964890951),
 ('deathtrap', 3.8066624897703196),
 ('panahi', 3.713572066704308),
 ('harilal', 3.713572066704308),
 ('ossessione', 3.6635616461296463),
 ('tsui', 3.6375861597263857),
 ('caruso', 3.6375861597263857),
 ('sabu', 3.6109179126442243),
 ('ahmad', 3.6109179126442243),
 ('khouri', 3.5835189384561099),
 ('dominick', 3.5835189384561099),
 ('aweigh', 3.5553480614894135),
 ('mj', 3.5553480614894135),
 ('mcintire', 3.5263605246161616),
 ('kriemhild', 3.5263605246161616),
 ('blackie', 3.4965075614664802),
 ('newcombe', 3.4965075614664802),
 ('daisies', 3.4965075614664802),
 ('trelkovsky', 3.4657359027997265),
 ('kei', 3.4657359027997265),
 ('jaffar', 3.4339872044851463),
 ('hilliard', 3.4339872044851463),
 ('gundam', 3.4231762883809305),
 ('sheeta', 3.4011973816621555),
 ('pazu', 3.4011973816621555),
 ('bathsheba', 3.4011973816621555),
 ('krell', 3.4011973816621555),
 ('offside', 3.4011973816621555),
 ('fineman', 3.3672958299864741),
 ('venoms', 3.3672958299864741),
 ('ranma', 3.3322045101752038),
 ('ronny', 3.3322045101752038),
 ('paine', 3.3322045101752038),
 ('pimlico', 3.3322045101752038),
 ('abhay', 3.2958368660043291),
 ('iturbi', 3.2771447329921766),
 ('pym', 3.2580965380214821),
 ('kipling', 3.2580965380214821),
 ('audiard', 3.2188758248682006),
 ('kelso', 3.2188758248682006),
 ('milverton', 3.2188758248682006),
 ('scalise', 3.2188758248682006),
 ('gabe', 3.2188758248682006),
 ('feinstone', 3.1780538303479458),
 ('mukhsin', 3.1780538303479458),
 ('grisby', 3.1780538303479458),
 ('xica', 3.1780538303479458),
 ('moonwalker', 3.1780538303479458),
 ('giovanna', 3.1780538303479458),
 ('felix', 3.1527360223636558),
 ('togar', 3.1354942159291497),
 ('chikatilo', 3.1354942159291497),
 ('heaton', 3.1354942159291497),
 ('jannings', 3.1354942159291497),
 ('luzhin', 3.1135153092103742),
 ('pidgeon', 3.0910424533583161),
 ('matuschek', 3.0910424533583161),
 ('miklos', 3.0910424533583161),
 ('soha', 3.0910424533583161),
 ('fanfan', 3.0910424533583161),
 ('desdemona', 3.0910424533583161),
 ('matador', 3.0910424533583161),
 ('leonora', 3.0910424533583161),
 ('philo', 3.068052935133617),
 ('microfilm', 3.044522437723423),
 ('firemen', 3.044522437723423),
 ('gauri', 3.044522437723423),
 ('lindy', 3.044522437723423),
 ('maradona', 3.044522437723423),
 ('bjm', 3.044522437723423),
 ('joss', 3.044522437723423),
 ('reda', 3.044522437723423),
 ('capote', 3.0122615755052013),
 ('fido', 3.0081547935525483),
 ('mcintyre', 2.9957322735539909),
 ('prote', 2.9957322735539909),
 ('siegfried', 2.9957322735539909),
 ('emory', 2.9957322735539909),
 ('coonskin', 2.9957322735539909),
 ('quibble', 2.9957322735539909),
 ('carrre', 2.9957322735539909),
 ('coe', 2.9957322735539909),
 ('excellently', 2.9789251552376097),
 ('clutter', 2.9704144655697009),
 ('pakeezah', 2.9444389791664403),
 ('ferdie', 2.9444389791664403),
 ('ackland', 2.9444389791664403),
 ('anchors', 2.9444389791664403),
 ('baloo', 2.9444389791664403),
 ('knockout', 2.9444389791664403),
 ('burakov', 2.9444389791664403),
 ('rvd', 2.9444389791664403),
 ('railly', 2.9444389791664403),
 ('schlesinger', 2.9444389791664403),
 ('flippen', 2.9444389791664403),
 ('pinjar', 2.9444389791664403),
 ('digicorp', 2.9444389791664403),
 ('hillyer', 2.9444389791664403),
 ('vance', 2.9444389791664403),
 ('magnus', 2.9444389791664403),
 ('petiot', 2.9444389791664403),
 ('versatility', 2.9444389791664403),
 ('malfique', 2.9444389791664403),
 ('kolchak', 2.9311937524164198),
 ('hayworth', 2.9267394020670396),
 ('deanna', 2.9267394020670396),
 ('iek', 2.8903717578961645),
 ('lando', 2.8903717578961645),
 ('geer', 2.8903717578961645),
 ('burgade', 2.8903717578961645),
 ('falco', 2.8903717578961645),
 ('pollak', 2.8903717578961645),
 ('guerrero', 2.8903717578961645),
 ('hobson', 2.8903717578961645),
 ('pappas', 2.8903717578961645),
 ('volckman', 2.8903717578961645),
 ('hoechlin', 2.8903717578961645),
 ('iphigenia', 2.8903717578961645),
 ('korda', 2.8622008809294686),
 ('sammo', 2.8526314299133175),
 ('orked', 2.8332133440562162),
 ('nighy', 2.8332133440562162),
 ('cdric', 2.8332133440562162),
 ('laputa', 2.8332133440562162),
 ('naudet', 2.8332133440562162),
 ('callahan', 2.8332133440562162),
 ('beckett', 2.8332133440562162),
 ('biko', 2.8332133440562162),
 ('jeon', 2.8332133440562162),
 ('kralik', 2.8332133440562162),
 ('peralta', 2.8332133440562162),
 ('nagra', 2.8332133440562162),
 ('jacknife', 2.8332133440562162),
 ('toughness', 2.8332133440562162),
 ('hewlett', 2.8332133440562162),
 ('sox', 2.8332133440562162),
 ('polanski', 2.8233610476132043),
 ('alvin', 2.8183982582710754),
 ('matthau', 2.8067217286092401),
 ('aiello', 2.8033603809065348),
 ('gaiman', 2.7725887222397811),
 ('endor', 2.7725887222397811),
 ('janos', 2.7725887222397811),
 ('rotj', 2.7725887222397811),
 ('yvaine', 2.7725887222397811),
 ('macready', 2.7725887222397811),
 ('hulce', 2.7725887222397811),
 ('firefighter', 2.7725887222397811),
 ('mathieu', 2.7725887222397811),
 ('delpy', 2.7725887222397811),
 ('coulouris', 2.7725887222397811),
 ('natalia', 2.7725887222397811),
 ('bedknobs', 2.7725887222397811),
 ('bombshells', 2.7725887222397811),
 ('duffell', 2.7725887222397811),
 ('santos', 2.7725887222397811),
 ('antz', 2.7725887222397811),
 ('gackt', 2.7515353130419489),
 ('myrtle', 2.7515353130419489),
 ('adele', 2.7515353130419489),
 ('bake', 2.7408400239252009),
 ('gilliam', 2.7245795030534206),
 ('soutendijk', 2.7080502011022101),
 ('doktor', 2.7080502011022101),
 ('shintaro', 2.7080502011022101),
 ('burman', 2.7080502011022101),
 ('hilda', 2.7080502011022101),
 ('johnnie', 2.7080502011022101),
 ('baton', 2.7080502011022101),
 ('cognac', 2.7080502011022101),
 ('gunbuster', 2.7080502011022101),
 ('silberling', 2.7080502011022101),
 ('ashraf', 2.7080502011022101),
 ('gannon', 2.7080502011022101),
 ('uld', 2.7080502011022101),
 ('lanisha', 2.7080502011022101),
 ('ballantine', 2.7080502011022101),
 ('hickock', 2.7080502011022101),
 ('aviv', 2.7080502011022101),
 ('lian', 2.7080502011022101),
 ('bernsen', 2.7080502011022101),
 ('karas', 2.7080502011022101),
 ('egon', 2.7080502011022101),
 ('parador', 2.7080502011022101),
 ('oro', 2.7080502011022101),
 ('eustache', 2.7080502011022101),
 ('cheh', 2.7080502011022101),
 ('mcanally', 2.7080502011022101),
 ('calamai', 2.7080502011022101),
 ('kiley', 2.7080502011022101),
 ('goines', 2.7080502011022101),
 ('rotoscoped', 2.7080502011022101),
 ('schildkraut', 2.7080502011022101),
 ('grasshoppers', 2.7080502011022101),
 ('valette', 2.7080502011022101),
 ('victoria', 2.6810215287142909),
 ('partition', 2.6741486494265287),
 ('dench', 2.6741486494265287),
 ('conroy', 2.6741486494265287),
 ('beery', 2.6741486494265287),
 ('chavez', 2.6672282065819548),
 ('ratso', 2.653241964607215),
 ('atoz', 2.6390573296152584),
 ('kabei', 2.6390573296152584),
 ('kulkarni', 2.6390573296152584),
 ('rien', 2.6390573296152584),
 ('gardenia', 2.6390573296152584),
 ('emy', 2.6390573296152584),
 ('megs', 2.6390573296152584),
 ('hickam', 2.6390573296152584),
 ('danelia', 2.6390573296152584),
 ('duprez', 2.6390573296152584),
 ('embezzler', 2.6390573296152584),
 ('fetisov', 2.6390573296152584),
 ('treaty', 2.6390573296152584),
 ('scrat', 2.6390573296152584),
 ('laine', 2.6390573296152584),
 ('gialli', 2.6390573296152584),
 ('cb', 2.6390573296152584),
 ('ishwar', 2.6390573296152584),
 ('cartwrights', 2.6390573296152584),
 ('ingram', 2.6390573296152584),
 ('harriet', 2.6390573296152584),
 ('pang', 2.6390573296152584),
 ('melancholic', 2.6390573296152584),
 ('intricately', 2.6390573296152584),
 ('bathhouse', 2.6390573296152584),
 ('pilgrimage', 2.6390573296152584),
 ('tulip', 2.6390573296152584),
 ('beek', 2.6390573296152584),
 ('katsu', 2.6026896854443837),
 ('mildred', 2.6026896854443837),
 ('ultimatum', 2.6026896854443837),
 ('dev', 2.6026896854443837),
 ('fricker', 2.6026896854443837),
 ('emil', 2.6026896854443837),
 ('mclaglen', 2.5649493574615367),
 ('girotti', 2.5649493574615367),
 ('goring', 2.5649493574615367),
 ('guadalcanal', 2.5649493574615367),
 ('oakie', 2.5649493574615367),
 ('broadbent', 2.5649493574615367),
 ('sugiyama', 2.5649493574615367),
 ('tissues', 2.5649493574615367),
 ('luchino', 2.5649493574615367),
 ('nibelungen', 2.5649493574615367),
 ('cynics', 2.5649493574615367),
 ('mcdoakes', 2.5649493574615367),
 ('adjani', 2.5649493574615367),
 ('freebird', 2.5649493574615367),
 ('autograph', 2.5649493574615367),
 ('riget', 2.5649493574615367),
 ('odysseus', 2.5649493574615367),
 ('brownstone', 2.5649493574615367),
 ('choi', 2.5649493574615367),
 ('unsung', 2.5649493574615367),
 ('chavo', 2.5649493574615367),
 ('bahrain', 2.5649493574615367),
 ('holloway', 2.5649493574615367),
 ('sputnik', 2.5649493574615367),
 ('saura', 2.5649493574615367),
 ('boop', 2.5649493574615367),
 ('eglantine', 2.5649493574615367),
 ('gabriella', 2.5649493574615367),
 ('dola', 2.5649493574615367),
 ('erendira', 2.5649493574615367),
 ('bouvier', 2.5649493574615367),
 ('yelnats', 2.5649493574615367),
 ('corbett', 2.5494451709255714),
 ('warhols', 2.5389738710582761),
 ('gandhi', 2.5389738710582761),
 ('sammi', 2.5389738710582761),
 ('abu', 2.5389738710582761),
 ('zu', 2.5389738710582761),
 ('delightfully', 2.5257286443082556),
 ('sirk', 2.5199979695992702),
 ('rosenstrasse', 2.5123056239761148),
 ('creasy', 2.5055259369907361),
 ('braveheart', 2.5014359517392109),
 ('herge', 2.4849066497880004),
 ('barrister', 2.4849066497880004),
 ('santiago', 2.4849066497880004),
 ('cacoyannis', 2.4849066497880004),
 ('blackadder', 2.4849066497880004),
 ('vierde', 2.4849066497880004),
 ('lassalle', 2.4849066497880004),
 ('parminder', 2.4849066497880004),
 ('hayao', 2.4849066497880004),
 ('trenholm', 2.4849066497880004),
 ('bressart', 2.4849066497880004),
 ('natures', 2.4849066497880004),
 ('presque', 2.4849066497880004),
 ('yuzna', 2.4849066497880004),
 ('lafitte', 2.4849066497880004),
 ('mcadam', 2.4849066497880004),
 ('unpretentious', 2.4849066497880004),
 ('hecht', 2.4849066497880004),
 ('perdition', 2.4849066497880004),
 ('gallico', 2.4849066497880004),
 ('holodeck', 2.4849066497880004),
 ('balduin', 2.4849066497880004),
 ('bouzaglo', 2.4849066497880004),
 ('attila', 2.4849066497880004),
 ('mcphillip', 2.4849066497880004),
 ('kazan', 2.4849066497880004),
 ('rideau', 2.4849066497880004),
 ('luger', 2.4849066497880004),
 ('bischoff', 2.4849066497880004),
 ('poonam', 2.4849066497880004),
 ('talos', 2.4849066497880004),
 ('binder', 2.4849066497880004),
 ('euripides', 2.4849066497880004),
 ('killian', 2.4849066497880004),
 ('lupino', 2.4849066497880004),
 ('yeon', 2.4849066497880004),
 ('strindberg', 2.4849066497880004),
 ('hanlon', 2.4849066497880004),
 ('anselmo', 2.4849066497880004),
 ('clutters', 2.4849066497880004),
 ('vonnegut', 2.4638532405901681),
 ('mccoy', 2.456735772821304),
 ('taker', 2.456735772821304),
 ('flawless', 2.451005098112319),
 ('othello', 2.4485390056171252),
 ('natali', 2.4423470353692043),
 ('abbey', 2.4423470353692043),
 ('godmother', 2.4423470353692043),
 ('judi', 2.4423470353692043),
 ('jonestown', 2.4423470353692043),
 ('mahatma', 2.4423470353692043),
 ('mcnally', 2.4423470353692043),
 ('novak', 2.4361164856185682),
 ('durbin', 2.4277482359480516),
 ('christy', 2.4203681286504293),
 ('cheadle', 2.4159137783010487),
 ('unsurpassed', 2.3978952727983707),
 ('hallen', 2.3978952727983707),
 ('rawhide', 2.3978952727983707),
 ('eisenhower', 2.3978952727983707),
 ('faultless', 2.3978952727983707),
 ('wilhelm', 2.3978952727983707),
 ('vadar', 2.3978952727983707),
 ('capano', 2.3978952727983707),
 ('eminent', 2.3978952727983707),
 ('waterman', 2.3978952727983707),
 ('leaud', 2.3978952727983707),
 ('hanka', 2.3978952727983707),
 ('prologues', 2.3978952727983707),
 ('muska', 2.3978952727983707),
 ('bartel', 2.3978952727983707),
 ('showings', 2.3978952727983707),
 ('nord', 2.3978952727983707),
 ('sweetin', 2.3978952727983707),
 ('rf', 2.3978952727983707),
 ('fellowes', 2.3978952727983707),
 ('fuller', 2.3978952727983707),
 ('faust', 2.3978952727983707),
 ('nyqvist', 2.3978952727983707),
 ('arrondissement', 2.3978952727983707),
 ('shep', 2.3978952727983707),
 ('stockwell', 2.3978952727983707),
 ('radiant', 2.3978952727983707),
 ('dragoon', 2.3978952727983707),
 ('starewicz', 2.3978952727983707),
 ('tetsur', 2.3978952727983707),
 ('ramones', 2.3978952727983707),
 ('cannavale', 2.3978952727983707),
 ('taoist', 2.3978952727983707),
 ('filone', 2.3978952727983707),
 ('dorfman', 2.3978952727983707),
 ('gracia', 2.3978952727983707),
 ('bruhl', 2.3978952727983707),
 ('pei', 2.3978952727983707),
 ('stitzer', 2.3978952727983707),
 ('gorris', 2.3978952727983707),
 ('regent', 2.3978952727983707),
 ('gundams', 2.3978952727983707),
 ('antonietta', 2.3978952727983707),
 ('seine', 2.3978952727983707),
 ('lok', 2.3978952727983707),
 ('huns', 2.3978952727983707),
 ('nandini', 2.3978952727983707),
 ('bolan', 2.3978952727983707),
 ('zp', 2.3978952727983707),
 ('thursby', 2.3978952727983707),
 ('bazza', 2.3978952727983707),
 ('woronov', 2.3978952727983707),
 ('tykwer', 2.3978952727983707),
 ('pike', 2.3978952727983707),
 ('sjoman', 2.3978952727983707),
 ('dorsey', 2.3978952727983707),
 ('cookbook', 2.3978952727983707),
 ('noriko', 2.3978952727983707),
 ('viennese', 2.3978952727983707),
 ('soapdish', 2.3978952727983707),
 ('fanshawe', 2.3978952727983707),
 ('spacecamp', 2.3978952727983707),
 ('atul', 2.3978952727983707),
 ('hoss', 2.3978952727983707),
 ('yasmin', 2.3978952727983707),
 ('pita', 2.367123614131617),
 ('cal', 2.3608540011180215),
 ('reservations', 2.3513752571634776),
 ('elsa', 2.3513752571634776),
 ('schmid', 2.3513752571634776),
 ('oberon', 2.3513752571634776),
 ('johnston', 2.3513752571634776),
 ('marylee', 2.3513752571634776),
 ('brisson', 2.3513752571634776),
 ('winchester', 2.3353749158170367),
 ('jabba', 2.3272777055844172),
 ('chamberlain', 2.3191143949452564),
 ('jud', 2.3025850929940459),
 ('montrose', 2.3025850929940459),
 ('coop', 2.3025850929940459),
 ('bannister', 2.3025850929940459),
 ('homeward', 2.3025850929940459),
 ('hundstage', 2.3025850929940459),
 ('manny', 2.3025850929940459),
 ('colman', 2.3025850929940459),
 ('tigerland', 2.3025850929940459),
 ('ungar', 2.3025850929940459),
 ('girlfight', 2.3025850929940459),
 ('haines', 2.3025850929940459),
 ('rea', 2.3025850929940459),
 ('flamenco', 2.3025850929940459),
 ('carla', 2.2772672850097559),
 ('hanzo', 2.2772672850097559),
 ('fagin', 2.2772672850097559),
 ('sullavan', 2.2686835413183641),
 ('aunts', 2.2686835413183641),
 ('olympia', 2.2686835413183641),
 ('sabrina', 2.2643638801738479),
 ('superbly', 2.2600254785752498),
 ('linklater', 2.2512917986064953),
 ('elia', 2.2512917986064953),
 ('beetle', 2.2512917986064953),
 ('mccartney', 2.2512917986064953),
 ('tully', 2.2512917986064953),
 ('hickok', 2.2512917986064953),
 ('peters', 2.2512917986064953),
 ('sweetly', 2.2512917986064953),
 ('aborigines', 2.2512917986064953),
 ('zentropa', 2.2512917986064953),
 ('gigi', 2.2512917986064953),
 ('northam', 2.2407096892759584),
 ('tomlinson', 2.2335922215070942),
 ('tenant', 2.2284771208403238),
 ('influential', 2.224623551524334),
 ('kalifornia', 2.224623551524334),
 ('stardust', 2.2192034840549946),
 ('kinnear', 2.2155737160044158),
 ('quartier', 2.1972245773362196),
 ('complement', 2.1972245773362196),
 ('hatcher', 2.1972245773362196),
 ('raoul', 2.1972245773362196),
 ('squire', 2.1972245773362196),
 ('vertigo', 2.1972245773362196),
 ('treasured', 2.1972245773362196),
 ('benet', 2.1972245773362196),
 ('magnificently', 2.1972245773362196),
 ('raines', 2.1972245773362196),
 ('finely', 2.1690537003695232),
 ('cheung', 2.1690537003695232),
 ('stevenson', 2.1690537003695232),
 ('georges', 2.1649637151179979),
 ('perfection', 2.1594842493533721),
 ('weir', 2.1594842493533721),
 ('iris', 2.1594842493533721),
 ('marvelously', 2.1594842493533721),
 ('rukh', 2.1594842493533721),
 ('enchanting', 2.1517622032594619),
 ('sailing', 2.1400661634962708),
 ('kennel', 2.1400661634962708),
 ('brodie', 2.1400661634962708),
 ('dixon', 2.1400661634962708),
 ('anand', 2.1400661634962708),
 ('speakeasy', 2.1400661634962708),
 ('celine', 2.1400661634962708),
 ('province', 2.1400661634962708),
 ('astaire', 2.1400661634962708),
 ('pressburger', 2.1400661634962708),
 ('spellbinding', 2.1400661634962708),
 ('leung', 2.1400661634962708),
 ('mahoney', 2.1400661634962708),
 ('curr', 2.1400661634962708),
 ('cartwright', 2.1400661634962708),
 ('trier', 2.1316272948504063),
 ('pecker', 2.1282317058492679),
 ('wodehouse', 2.120263536200091),
 ('miyazaki', 2.120263536200091),
 ('vulnerability', 2.1102132003465894),
 ('darius', 2.1102132003465894),
 ('marjorie', 2.1102132003465894),
 ('hark', 2.1102132003465894),
 ('devotion', 2.1041341542702074),
 ('loomis', 2.0794415416798357),
 ('tackled', 2.0794415416798357),
 ('mores', 2.0794415416798357),
 ('fez', 2.0794415416798357),
 ('benoit', 2.0794415416798357),
 ('braun', 2.0794415416798357),
 ('devos', 2.0794415416798357),
 ('iago', 2.0794415416798357),
 ('boothe', 2.0794415416798357),
 ('romy', 2.0794415416798357),
 ('janeway', 2.0794415416798357),
 ('footlight', 2.0794415416798357),
 ('yang', 2.0794415416798357),
 ('abbot', 2.0794415416798357),
 ('genesis', 2.0794415416798357),
 ('malone', 2.0794415416798357),
 ('duryea', 2.0794415416798357),
 ('lupin', 2.0794415416798357),
 ('cheech', 2.0541237336955462),
 ('mol', 2.0476928433652555),
 ('ella', 2.0476928433652555),
 ('tel', 2.0476928433652555),
 ('askey', 2.0476928433652555),
 ('fairytale', 2.0476928433652555),
 ('captures', 2.0386195471595809),
 ('tintin', 2.0368819272610401),
 ('curtiz', 2.0368819272610401),
 ('duchess', 2.0368819272610401),
 ('quotable', 2.0368819272610401),
 ('edith', 2.0368819272610401),
 ('pickford', 2.0314323224934752),
 ('scoop', 2.0314323224934752),
 ('voight', 2.0301704926730531),
 ('wonderfully', 2.0218960560332353),
 ('patekar', 2.0149030205422647),
 ('richness', 2.0149030205422647),
 ('ephemeral', 2.0149030205422647),
 ('blaine', 2.0149030205422647),
 ('donnell', 2.0149030205422647),
 ('accomplishes', 2.0149030205422647),
 ('archibald', 2.0149030205422647),
 ('stalkers', 2.0149030205422647),
 ('dearly', 2.0149030205422647),
 ('gilley', 2.0149030205422647),
 ('kessler', 2.0149030205422647),
 ('silvio', 2.0149030205422647),
 ('goa', 2.0149030205422647),
 ('saffron', 2.0149030205422647),
 ('brokedown', 2.0149030205422647),
 ('sasha', 2.0149030205422647),
 ('broomsticks', 2.0149030205422647),
 ('milton', 2.0149030205422647),
 ('tetsuo', 2.0149030205422647),
 ('laird', 2.0149030205422647),
 ('stubby', 2.0149030205422647),
 ('zorro', 2.0053335695261141),
 ('clarence', 2.0014800002101243),
 ('dandy', 1.9980959022258835),
 ('bozz', 1.9924301646902061),
 ('pickpocket', 1.9924301646902061),
 ('pawn', 1.9924301646902061),
 ('frailty', 1.9924301646902061),
 ('egan', 1.9924301646902061),
 ('heartbreaking', 1.9924301646902061),
 ('venezuela', 1.9810014688665833),
 ('powell', 1.9783454248084671),
 ('wang', 1.9740810260220096),
 ('brosnan', 1.9547990964725592),
 ('refreshingly', 1.9459101490553132),
 ('hawke', 1.9459101490553132),
 ('aames', 1.9459101490553132),
 ('collinwood', 1.9459101490553132),
 ('henderson', 1.9459101490553132),
 ('meena', 1.9459101490553132),
 ('preity', 1.9459101490553132),
 ('truman', 1.9459101490553132),
 ('argentine', 1.9459101490553132),
 ('trebor', 1.9459101490553132),
 ('carre', 1.9459101490553132),
 ('ida', 1.9459101490553132),
 ('enthralling', 1.9459101490553132),
 ('noll', 1.9459101490553132),
 ('wegener', 1.9459101490553132),
 ('mala', 1.9459101490553132),
 ('francois', 1.9459101490553132),
 ('transcends', 1.9459101490553132),
 ('wtc', 1.9459101490553132),
 ('cratchit', 1.9459101490553132),
 ('singin', 1.9459101490553132),
 ('gingold', 1.9459101490553132),
 ('rathbone', 1.9459101490553132),
 ('strathairn', 1.9459101490553132),
 ('stairway', 1.9459101490553132),
 ('bafta', 1.9459101490553132),
 ('masterful', 1.9459101490553132),
 ('slipper', 1.9459101490553132),
 ('oshii', 1.9459101490553132),
 ('mostel', 1.9459101490553132),
 ('sg', 1.927891643552635),
 ('masterson', 1.927891643552635),
 ('sunrise', 1.9252908618525775),
 ('lily', 1.9203768470501485),
 ('corbin', 1.9169226121820611),
 ('lila', 1.9169226121820611),
 ('turturro', 1.9095425048844386),
 ('adelaide', 1.9095425048844386),
 ('expertly', 1.9095425048844386),
 ('lighten', 1.9095425048844386),
 ('bakshi', 1.9029851043382795),
 ('lincoln', 1.9014583864844796),
 ('pabst', 1.8971199848858813),
 ('exquisitely', 1.8971199848858813),
 ('sumptuous', 1.8971199848858813),
 ('firefighters', 1.8971199848858813),
 ('skagway', 1.8971199848858813),
 ('finlay', 1.8971199848858813),
 ('europa', 1.8971199848858813),
 ('bhandarkar', 1.8971199848858813),
 ('darren', 1.8925641683500207),
 ('chang', 1.8870696490323797),
 ('booker', 1.8870696490323797),
 ('tomei', 1.8827312474337816),
 ('flik', 1.8718021769015913),
 ('iberia', 1.8718021769015913),
 ('kumari', 1.8718021769015913),
 ('maslin', 1.8718021769015913),
 ('lyman', 1.8718021769015913),
 ('lucienne', 1.8718021769015913),
 ('ozzie', 1.8718021769015913),
 ('lucile', 1.8718021769015913),
 ('cassavettes', 1.8718021769015913),
 ('ronda', 1.8718021769015913),
 ('cypher', 1.8718021769015913),
 ('borowczyk', 1.8718021769015913),
 ('labyrinth', 1.8718021769015913),
 ('blindness', 1.8718021769015913),
 ('informer', 1.8718021769015913),
 ('schygulla', 1.8718021769015913),
 ('butterflies', 1.8718021769015913),
 ('paget', 1.8718021769015913),
 ('collaborators', 1.8718021769015913),
 ('coccio', 1.8718021769015913),
 ('superlative', 1.8718021769015913),
 ('aborigine', 1.8718021769015913),
 ('rolle', 1.8718021769015913),
 ('impressionist', 1.8718021769015913),
 ('ellie', 1.8718021769015913),
 ('kher', 1.8718021769015913),
 ('holt', 1.8718021769015913),
 ('solicitor', 1.8718021769015913),
 ('bettany', 1.8718021769015913),
 ('bulimia', 1.8718021769015913),
 ('busby', 1.8718021769015913),
 ('stealer', 1.8718021769015913),
 ('spellbound', 1.8718021769015913),
 ('observant', 1.8718021769015913),
 ('greene', 1.8607523407150064),
 ('carell', 1.8588987720656835),
 ('sematary', 1.8562979903656263),
 ('refreshing', 1.8551812956655511),
 ('montana', 1.8538912503350613),
 ('pegg', 1.8523840910444898),
 ('breathtaking', 1.8481124057791867),
 ('bourne', 1.8478489358790986),
 ('siu', 1.8458266904983307),
 ('prix', 1.8458266904983307),
 ('teri', 1.8458266904983307),
 ('lemmon', 1.8458266904983307),
 ('splendidly', 1.8458266904983307),
 ('twisty', 1.8458266904983307),
 ('uncompromising', 1.8458266904983307),
 ('hardwicke', 1.8458266904983307),
 ('electrifying', 1.8458266904983307),
 ('mendes', 1.8325814637483102),
 ('morbius', 1.8325814637483102),
 ('guiness', 1.8325814637483102),
 ('brock', 1.8325814637483102),
 ('walsh', 1.8325814637483102),
 ('abby', 1.8325814637483102),
 ('zelda', 1.8325814637483102),
 ('zabriskie', 1.824549292051046),
 ('connolly', 1.824549292051046),
 ('keeler', 1.824549292051046),
 ('carface', 1.824549292051046),
 ('batwoman', 1.824549292051046),
 ('vincenzo', 1.8191584434161694),
 ('precise', 1.8152899666382492),
 ('parrot', 1.8152899666382492),
 ('explores', 1.8082887711792655),
 ('steele', 1.8082887711792655),
 ('delightful', 1.8002701588959635),
 ('flynn', 1.7996646487351682),
 ('rafael', 1.791759469228055),
 ('franks', 1.791759469228055),
 ('celebrates', 1.791759469228055),
 ('corsaut', 1.791759469228055),
 ('conor', 1.791759469228055),
 ('breathes', 1.791759469228055),
 ('geddes', 1.791759469228055),
 ('enforced', 1.791759469228055),
 ('perseverance', 1.791759469228055),
 ('colonialism', 1.791759469228055),
 ('demunn', 1.791759469228055),
 ('advent', 1.791759469228055),
 ('tumultuous', 1.791759469228055),
 ('jong', 1.791759469228055),
 ('leia', 1.791759469228055),
 ('auer', 1.791759469228055),
 ('strangler', 1.791759469228055),
 ('culp', 1.791759469228055),
 ('weismuller', 1.791759469228055),
 ('patric', 1.791759469228055),
 ('uproarious', 1.791759469228055),
 ('nan', 1.791759469228055),
 ('talespin', 1.791759469228055),
 ('shemp', 1.791759469228055),
 ('linden', 1.791759469228055),
 ('mischa', 1.791759469228055),
 ('nicolai', 1.791759469228055),
 ('marcus', 1.791759469228055),
 ('examines', 1.791759469228055),
 ('parisian', 1.791759469228055),
 ('runyon', 1.791759469228055),
 ('witherspoon', 1.791759469228055),
 ('bearer', 1.791759469228055),
 ('champions', 1.791759469228055),
 ('robust', 1.791759469228055),
 ('dedlock', 1.791759469228055),
 ('evans', 1.791759469228055),
 ('britton', 1.791759469228055),
 ('zandalee', 1.791759469228055),
 ('rogen', 1.791759469228055),
 ('lumire', 1.791759469228055),
 ('heightens', 1.791759469228055),
 ('crouse', 1.791759469228055),
 ('travers', 1.791759469228055),
 ('raja', 1.791759469228055),
 ('prem', 1.791759469228055),
 ('trejo', 1.791759469228055),
 ('nath', 1.791759469228055),
 ('massey', 1.791759469228055),
 ('tadashi', 1.791759469228055),
 ('strides', 1.791759469228055),
 ('trotta', 1.791759469228055),
 ('mower', 1.791759469228055),
 ('leisen', 1.791759469228055),
 ('undying', 1.791759469228055),
 ('rory', 1.791759469228055),
 ('chess', 1.7797832781813394),
 ('andrews', 1.7764919970972666),
 ('homer', 1.7692866133759964),
 ('apartheid', 1.7635885922613586),
 ('beautifully', 1.7626953362841438),
 ('foch', 1.7578579175523736),
 ('soccer', 1.7578579175523736),
 ('friendships', 1.7578579175523736),
 ('maclean', 1.7578579175523736),
 ('ira', 1.7491998548092591),
 ('deliciously', 1.7491998548092591),
 ('reginald', 1.7491998548092591),
 ('miners', 1.7491998548092591),
 ('todesking', 1.7491998548092591),
 ('lumet', 1.7462970951512977),
 ('affection', 1.7452394535931621),
 ('cedric', 1.742969305058623),
 ('bittersweet', 1.742969305058623),
 ('elvira', 1.7397031072720019),
 ('carrell', 1.7346010553881064),
 ('silhouette', 1.7346010553881064),
 ('radium', 1.7346010553881064),
 ('custer', 1.7346010553881064),
 ('caprice', 1.7346010553881064),
 ('stepsisters', 1.7346010553881064),
 ('bureaucracy', 1.7346010553881064),
 ('shefali', 1.7346010553881064),
 ('kovacs', 1.7346010553881064),
 ('ilona', 1.7346010553881064),
 ('provo', 1.7346010553881064),
 ('hoon', 1.7346010553881064),
 ('dell', 1.7346010553881064),
 ('ullman', 1.7346010553881064),
 ('axel', 1.7346010553881064),
 ('deft', 1.7346010553881064),
 ('vulcan', 1.7346010553881064),
 ('entranced', 1.7346010553881064),
 ('scorpion', 1.7346010553881064),
 ('kidman', 1.729239112246721),
 ('paperhouse', 1.7227665977411035),
 ('underrated', 1.7197859696029656),
 ('sopranos', 1.7197859696029656),
 ('myrna', 1.7176514970743331),
 ('quintessential', 1.7176514970743331),
 ('gripping', 1.7165360479904674),
 ('superb', 1.7091514458966952),
 ('mastery', 1.7047480922384253),
 ('kibbee', 1.7047480922384253),
 ('borden', 1.7047480922384253),
 ('pension', 1.7047480922384253),
 ('partnership', 1.7047480922384253),
 ('extravagant', 1.7047480922384253),
 ('sternberg', 1.7047480922384253),
 ('montand', 1.7047480922384253),
 ('perceptions', 1.7047480922384253),
 ('minton', 1.7047480922384253),
 ('expansion', 1.7047480922384253),
 ('rail', 1.7047480922384253),
 ('albuquerque', 1.7047480922384253),
 ('coveted', 1.7047480922384253),
 ('celeste', 1.7047480922384253),
 ('lassick', 1.7047480922384253),
 ('apollonia', 1.7047480922384253),
 ('rippner', 1.7047480922384253),
 ('poirot', 1.7047480922384253),
 ('birdie', 1.7047480922384253),
 ('eduardo', 1.7047480922384253),
 ('gorshin', 1.7047480922384253),
 ('friel', 1.7047480922384253),
 ('expressionistic', 1.7047480922384253),
 ('nunsploitation', 1.7047480922384253),
 ('connecticut', 1.7047480922384253),
 ('buttgereit', 1.7047480922384253),
 ('mavens', 1.7047480922384253),
 ('civilized', 1.7047480922384253),
 ('nina', 1.7047480922384253),
 ('rediscovered', 1.7047480922384253),
 ('moonstruck', 1.7047480922384253),
 ('dukakis', 1.7047480922384253),
 ('snare', 1.7047480922384253),
 ('warms', 1.7047480922384253),
 ('gallows', 1.7047480922384253),
 ('doolittle', 1.7047480922384253),
 ('criterion', 1.7047480922384253),
 ('dickinson', 1.7047480922384253),
 ('delon', 1.7047480922384253),
 ('cameroon', 1.7047480922384253),
 ('han', 1.6916760106710724),
 ('ealing', 1.6916760106710724),
 ('paula', 1.6863989535702288),
 ('yoda', 1.6863989535702288),
 ('holm', 1.6863989535702288),
 ('deliverance', 1.6863989535702288),
 ('weaves', 1.6863989535702288),
 ('bagdad', 1.6863989535702288),
 ('determination', 1.6817585740137264),
 ('muller', 1.6739764335716716),
 ('crashers', 1.6739764335716716),
 ('romanticized', 1.6739764335716716),
 ('schmidt', 1.6739764335716716),
 ('petition', 1.6739764335716716),
 ('jerome', 1.6739764335716716),
 ('doodlebops', 1.6739764335716716),
 ('bulldog', 1.6739764335716716),
 ('mvp', 1.6739764335716716),
 ('textile', 1.6739764335716716),
 ('scola', 1.6739764335716716),
 ('tierney', 1.6739764335716716),
 ('janice', 1.6739764335716716),
 ('sceptical', 1.6739764335716716),
 ('krabbe', 1.6739764335716716),
 ('caleb', 1.6739764335716716),
 ('delight', 1.6714733033535532),
 ('welles', 1.6677068205580761),
 ('reeve', 1.6677068205580761),
 ('zelah', 1.6650077635889111),
 ('sadness', 1.663505133704376),
 ('accustomed', 1.6582280766035324),
 ('shia', 1.6582280766035324),
 ('hermann', 1.6582280766035324),
 ('palsy', 1.6582280766035324),
 ('meatball', 1.6582280766035324),
 ('proposes', 1.6582280766035324),
 ('technicolor', 1.65455834771457),
 ('ae', 1.6529230243738393),
 ('nicky', 1.6486586255873816),
 ('soylent', 1.6486586255873816),
 ('restoration', 1.6486586255873816),
 ('tenderness', 1.6486586255873816),
 ('maintained', 1.6486586255873816),
 ('joyous', 1.6486586255873816),
 ('kline', 1.6422277352570913),
 ('sinatra', 1.6389967146756448),
 ('touching', 1.637217476541176),
 ('marisa', 1.634130525024472),
 ('stadium', 1.634130525024472),
 ('gershwin', 1.6314168191528755),
 ('timeless', 1.62924053973028),
 ('macy', 1.6211339521972916),
 ('unforgettable', 1.6177367152487956),
 ('favorites', 1.6158688027643908),
 ('stewart', 1.6119987332957739),
 ('grayson', 1.6094379124341003),
 ('shanks', 1.6094379124341003),
 ('airwolf', 1.6094379124341003),
 ('congrats', 1.6094379124341003),
 ('mammoth', 1.6094379124341003),
 ('henri', 1.6094379124341003),
 ('mammy', 1.6094379124341003),
 ('kabal', 1.6094379124341003),
 ('weber', 1.6094379124341003),
 ('prelude', 1.6094379124341003),
 ('taka', 1.6094379124341003),
 ('cruz', 1.6094379124341003),
 ('cocktails', 1.6094379124341003),
 ('judson', 1.6094379124341003),
 ('blier', 1.6094379124341003),
 ('enforcer', 1.6094379124341003),
 ('roberta', 1.6094379124341003),
 ('pendleton', 1.6094379124341003),
 ('internationally', 1.6094379124341003),
 ('jonny', 1.6094379124341003),
 ('taft', 1.6094379124341003),
 ('funhouse', 1.6094379124341003),
 ('monarchy', 1.6094379124341003),
 ('roshan', 1.6094379124341003),
 ('panda', 1.6094379124341003),
 ('patten', 1.6094379124341003),
 ('restrictive', 1.6094379124341003),
 ('compliments', 1.6094379124341003),
 ('anansa', 1.6094379124341003),
 ('duc', 1.6094379124341003),
 ('florinda', 1.6094379124341003),
 ('franchot', 1.6094379124341003),
 ('hartley', 1.6094379124341003),
 ('candid', 1.6094379124341003),
 ('breakdancing', 1.6094379124341003),
 ('sorbonne', 1.6094379124341003),
 ('noire', 1.6094379124341003),
 ('hoodlums', 1.6094379124341003),
 ('sullivan', 1.6094379124341003),
 ('perceptive', 1.6094379124341003),
 ('serrault', 1.6094379124341003),
 ('bloch', 1.6094379124341003),
 ('extraordinary', 1.6094379124341003),
 ('retriever', 1.6094379124341003),
 ('considerations', 1.6094379124341003),
 ('ringo', 1.6094379124341003),
 ('hahk', 1.6094379124341003),
 ('zenith', 1.6094379124341003),
 ('outstandingly', 1.6094379124341003),
 ('orphaned', 1.6094379124341003),
 ('dahlia', 1.6094379124341003),
 ('ponderosa', 1.6094379124341003),
 ('humanism', 1.6094379124341003),
 ('antidote', 1.6094379124341003),
 ('rugged', 1.6094379124341003),
 ('synthesis', 1.6094379124341003),
 ('lanchester', 1.6094379124341003),
 ('paxinou', 1.6094379124341003),
 ('tsing', 1.6094379124341003),
 ('competitor', 1.6094379124341003),
 ('summertime', 1.6094379124341003),
 ('duets', 1.6094379124341003),
 ('mcgrath', 1.6094379124341003),
 ('repulsion', 1.6094379124341003),
 ('eytan', 1.6094379124341003),
 ('grasshopper', 1.6094379124341003),
 ('everytown', 1.6094379124341003),
 ('hedy', 1.6094379124341003),
 ('priorities', 1.6094379124341003),
 ('kurosawa', 1.6094379124341003),
 ('chico', 1.6094379124341003),
 ('meloni', 1.6094379124341003),
 ('moulin', 1.6094379124341003),
 ('glacier', 1.6094379124341003),
 ('regency', 1.6094379124341003),
 ('advancing', 1.6094379124341003),
 ('complexities', 1.6094379124341003),
 ('unavailable', 1.6094379124341003),
 ('wai', 1.6094379124341003),
 ('nunez', 1.6094379124341003),
 ('brilliantly', 1.5950491749820008),
 ('einstein', 1.5910887737659039),
 ('liu', 1.5910887737659039),
 ('clara', 1.5910887737659039),
 ('dustin', 1.589235205116581),
 ('iran', 1.5841201044498106),
 ('conductor', 1.5841201044498106),
 ('shanghai', 1.5804503755608481),
 ('rainer', 1.575536360758419),
 ('alienate', 1.575536360758419),
 ('mesmerizing', 1.5723966407537513),
 ('raul', 1.5686159179138452),
 ('friendship', 1.5677652160335325),
 ('wonderful', 1.5645425925262093),
 ('sergeants', 1.5581446180465499),
 ('layered', 1.5581446180465499),
 ('corinne', 1.5581446180465499),
 ('seamlessly', 1.5581446180465499),
 ('demme', 1.5581446180465499),
 ('moriarty', 1.5581446180465499),
 ('trading', 1.5581446180465499),
 ...]

In [9]:
# the 30 entries with the lowest pos_neg_ratios values (the ones most skewed toward "NEGATIVE" reviews), lowest first
pos_neg_ratios.most_common()[:-31:-1]


Out[9]:
[('nisha', -4.6051701859880918),
 ('slater', -4.6051701859880918),
 ('ramtha', -4.6051701859880918),
 ('eod', -4.6051701859880918),
 ('dunaway', -4.6051701859880918),
 ('weisz', -4.6051701859880918),
 ('mckenna', -4.6051701859880918),
 ('zenia', -4.6051701859880918),
 ('tashan', -4.6051701859880918),
 ('dushku', -4.6051701859880918),
 ('gymkata', -4.6051701859880918),
 ('horrorfest', -4.6051701859880918),
 ('lordi', -4.6051701859880918),
 ('caulfield', -4.6051701859880918),
 ('nepotism', -4.6051701859880918),
 ('swinton', -4.6051701859880918),
 ('schwartzman', -4.6051701859880918),
 ('mraovich', -4.6051701859880918),
 ('carnosaur', -4.6051701859880918),
 ('strummer', -4.6051701859880918),
 ('gammera', -4.6051701859880918),
 ('balding', -4.6051701859880918),
 ('borel', -4.6051701859880918),
 ('richie', -4.6051701859880918),
 ('mohanlal', -4.6051701859880918),
 ('brinke', -4.6051701859880918),
 ('reb', -4.6051701859880918),
 ('pharaoh', -4.6051701859880918),
 ('cato', -4.6051701859880918),
 ('mallory', -4.6051701859880918)]

Transforming Text to Numbers

Example Predictions


In [10]:
from IPython.display import Image

# Example input that accompanies the diagram below; `review` is not consumed by any code in this cell.
review = "This was a horrible, terrible movie."

# Render the network diagram stored in a local PNG as the cell output.
Image(filename='sentiment_network.png')


Out[10]:

In [11]:
# Example input that accompanies the diagram below; `review` is not consumed by any code in this cell.
review = "The movie was excellent"

# Render the diagram stored in sentiment_network_pos.png as the cell output.
Image(filename='sentiment_network_pos.png')


Out[11]:

Creating the Input Data


In [12]:
# The vocabulary is every distinct word that appears as a key of total_counts.
vocab = set(total_counts)
vocab_size = len(vocab)
print(vocab_size)


74074

And now we can initialize our (empty) input layer as a vector of 0s. We'll modify it later by putting "1"s in various positions.


In [13]:
import numpy as np

# A single row with one slot per vocabulary word; every slot starts at 0.
layer_0 = np.zeros(shape=(1, vocab_size))
layer_0


Out[13]:
array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

And now we want to create a function that will set our layer_0 array to the correct sequence of 1s and 0s based on a single review. If you remember our picture from before, you might have noticed something: each word had a specific place in the input of our network.


In [14]:
from IPython.display import Image
# Display sentiment_network.png again as this cell's output.
Image(filename='sentiment_network.png')


Out[14]:

In order to create a function that can update our layer_0 variable based on a review, we have to decide which spots in our layer_0 vector (list of numbers) correspond to each word. Truth be told, it doesn't matter which ones we choose, only that we pick a spot for each word and stick with it. Let's decide those positions now and store them in a Python dictionary called "word2index".


In [15]:
# Give every vocabulary word a fixed position, numbered in the order
# the set happens to yield its elements.
word2index = {word: position for position, word in enumerate(vocab)}
word2index


Out[15]:
{'': 0,
 'muto': 1,
 'tobei': 2,
 'lacky': 3,
 'balalaika': 8,
 'noltie': 43833,
 'psicoanalitical': 6,
 'studliest': 7,
 'handkerchief': 62643,
 'aronofsky': 9,
 'housework': 10,
 'cavepeople': 11,
 'beachwear': 61755,
 'eliminate': 60026,
 'required': 48036,
 'marginalize': 61757,
 'blasted': 12,
 'peruvian': 14,
 'lokis': 24547,
 'cassady': 15,
 'gringos': 17,
 'singles': 18,
 'bomber': 19,
 'overplay': 20,
 'vampiros': 21,
 'slobby': 22,
 'loonies': 27,
 'chockful': 24,
 'leatherface': 26,
 'appearences': 28,
 'teens': 29,
 'excelling': 30,
 'bodied': 5,
 'clouds': 33,
 'porter': 59476,
 'crudeness': 34,
 'loerrta': 37,
 'posehn': 36,
 'sobriety': 38,
 'irit': 39,
 'extremiously': 41,
 'convert': 42,
 'throwaways': 51770,
 'kinski': 43,
 'liman': 44,
 'undeniable': 54183,
 'suberb': 45,
 'personl': 46,
 'kabuliwallah': 61762,
 'gammon': 48,
 'striking': 49,
 'former': 50,
 'boast': 51,
 'retooled': 24552,
 'bequest': 52,
 'janosch': 53,
 'sayeth': 54,
 'flaps': 56,
 'phantasmogoric': 37140,
 'baboon': 58,
 'minuted': 59,
 'upon': 60,
 'holocausts': 61,
 'forgive': 61767,
 'scarecreow': 65,
 'applications': 63,
 'lada': 64,
 'romolo': 67,
 'machiavellian': 24556,
 'graduates': 68,
 'snoopy': 69,
 'piotr': 70,
 'planche': 61769,
 'greebling': 49482,
 'overrules': 71,
 'choo': 69678,
 'flops': 24557,
 'incompetente': 72,
 'submissiveness': 73,
 'carpethia': 50005,
 'odlly': 24558,
 'nocked': 77,
 'utilitarian': 76,
 'hulking': 78,
 'ahhhhhhhhh': 80,
 'doreen': 83,
 'walker': 82,
 'raft': 45812,
 'encountered': 37145,
 'prescribes': 84,
 'collera': 61774,
 'vacano': 12413,
 'suposed': 85,
 'onetime': 86,
 'leese': 87,
 'charictor': 88,
 'ranna': 49484,
 'sjunde': 30420,
 'spanishness': 13,
 'manana': 91,
 'inertia': 61775,
 'berrymore': 93,
 'punter': 65913,
 'eragon': 94,
 'reasserted': 96,
 'winona': 97,
 'came': 98,
 'benumbed': 103,
 'cowriter': 100,
 'sideand': 106,
 'contribution': 102,
 'stilts': 105,
 'rodolfo': 107,
 'blaring': 24567,
 'shonen': 109,
 'sayori': 110,
 'huzzah': 111,
 'hollin': 115,
 'corkscrew': 31429,
 'specialize': 114,
 'decoration': 16,
 'viewmaster': 117,
 'belying': 119,
 'mermaids': 120,
 'zecchino': 122,
 'rumors': 61782,
 'zira': 123,
 'exaggerations': 124,
 'verna': 125,
 'badd': 126,
 'counselor': 127,
 'minded': 43373,
 'bekker': 128,
 'nadanova': 130,
 'topor': 61785,
 'showy': 131,
 'dorkily': 25,
 'michelle': 132,
 'terrytoons': 36526,
 'carrefour': 133,
 'defalting': 134,
 'bemoaning': 71367,
 'hallucinogenics': 135,
 'molester': 65936,
 'overpopulated': 136,
 'placesyou': 141,
 'dodes': 138,
 'vampyres': 139,
 'fledgling': 140,
 'nobudget': 142,
 'aetv': 143,
 'variations': 23,
 'abstracted': 145,
 'interval': 146,
 'codfish': 147,
 'tuxedoed': 149,
 'oscillators': 150,
 'presidents': 151,
 'sororities': 152,
 'congressman': 153,
 'childs': 154,
 'generification': 156,
 'financier': 49495,
 'loll': 65377,
 'gaffers': 58169,
 'piznarski': 157,
 'tidying': 158,
 'collins': 159,
 'pineapples': 160,
 'gift': 161,
 'cultivate': 37156,
 'guilty': 162,
 'luster': 24582,
 'picasso': 163,
 'savelyeva': 49497,
 'dislodge': 164,
 'redeaming': 31,
 'replaces': 61791,
 'junior': 167,
 'propositioned': 168,
 'weybridge': 169,
 'raptor': 170,
 'overhaul': 171,
 'photowise': 32,
 'transcribing': 172,
 'irvine': 173,
 'honourable': 64074,
 'swampy': 174,
 'greensward': 175,
 'female': 176,
 'dithered': 179,
 'grab': 178,
 'surpassing': 70932,
 'refreshingly': 180,
 'defect': 181,
 'vanlint': 182,
 'pitt': 183,
 'celebei': 184,
 'boiling': 35,
 'lorado': 53890,
 'unguarded': 186,
 'pessimist': 189,
 'steepest': 188,
 'haunts': 63674,
 'griffths': 190,
 'number': 191,
 'ailing': 192,
 'begets': 193,
 'holocaust': 45160,
 'existence': 194,
 'blatty': 195,
 'loch': 196,
 'vandermey': 197,
 'elie': 202,
 'loyd': 200,
 'absoutley': 201,
 'jersey': 40,
 'baffles': 203,
 'ornithologist': 204,
 'stribor': 57560,
 'novice': 205,
 'despises': 58017,
 'hopalong': 206,
 'underscoring': 207,
 'gorgeous': 208,
 'enrol': 209,
 'esqe': 210,
 'purcell': 73527,
 'casinos': 217,
 'imogene': 12434,
 'supress': 214,
 'ring': 215,
 'stallions': 216,
 'unprovocked': 24596,
 'wronged': 218,
 'hardboiled': 219,
 'secede': 37171,
 'enfolds': 228,
 'ceramic': 221,
 'posterchild': 222,
 'higres': 227,
 'booing': 230,
 'feasted': 226,
 'motherdid': 69663,
 'caratherisic': 229,
 'belgian': 18142,
 'gruntled': 232,
 'motown': 233,
 'palpitation': 62343,
 'chime': 234,
 'abridge': 235,
 'prolonging': 72719,
 'bishop': 236,
 'showcased': 237,
 'luiz': 238,
 'sinned': 240,
 'claiborne': 46890,
 'victoriain': 241,
 'regime': 244,
 'whiskey': 243,
 'riso': 56932,
 'setna': 247,
 'scrapbook': 248,
 'shortchanging': 47,
 'consumers': 249,
 'corresponds': 24602,
 'sren': 250,
 'glaudini': 251,
 'clunes': 252,
 'video': 253,
 'colorful': 254,
 'muscial': 255,
 'doncha': 260,
 'reverberate': 258,
 'confuses': 259,
 'muffin': 261,
 'tract': 265,
 'relieves': 263,
 'glories': 264,
 'politburo': 269,
 'kafka': 267,
 'soliloquies': 268,
 'songbook': 68816,
 'premade': 271,
 'stone': 73077,
 'vanner': 272,
 'pyromaniac': 60306,
 'avent': 12446,
 'extraordinarily': 273,
 'insightful': 274,
 'aficionado': 275,
 'defiantly': 276,
 'exeggcute': 277,
 'tolkien': 280,
 'mastroianni': 281,
 'battered': 282,
 'irresolute': 61812,
 'hassan': 283,
 'calamitous': 284,
 'glassily': 42702,
 'evergreen': 285,
 'metasonix': 61815,
 'dramatize': 286,
 'breaths': 288,
 'by': 69408,
 'bell': 290,
 'virtuostic': 51466,
 'riveting': 24610,
 'sheik': 291,
 'coldly': 292,
 'summers': 293,
 'gaudenzi': 294,
 'valderamma': 295,
 'sereneness': 18695,
 'caterpillar': 12450,
 'leisure': 297,
 'costell': 298,
 'masauki': 68207,
 'mogul': 300,
 'seijun': 301,
 'millardo': 308,
 'leina': 303,
 'hacienda': 310,
 'roshambo': 305,
 'staunch': 307,
 'impenetrable': 311,
 'tacitly': 312,
 'torchy': 314,
 'eyeshadow': 12453,
 'couric': 316,
 'ravelling': 24613,
 'swaile': 52475,
 'weissmuller': 317,
 'frider': 318,
 'spyl': 321,
 'pando': 320,
 'trembling': 322,
 'favorit': 323,
 'builder': 324,
 'chetniks': 24619,
 'roz': 325,
 'shearer': 326,
 'consoling': 61822,
 'spierlberg': 329,
 'overhauled': 331,
 'deconstruction': 332,
 'dealing': 333,
 'vadas': 334,
 'wuxia': 335,
 'lectern': 337,
 'abducts': 338,
 'angers': 12458,
 'myrna': 340,
 'mesake': 341,
 'castro': 61832,
 'murdoch': 342,
 'flaccid': 55,
 'fascinated': 344,
 'surprisingly': 345,
 'orignal': 49525,
 'three': 346,
 'karfreitag': 347,
 'infected': 348,
 'coronel': 352,
 'shara': 350,
 'shuddered': 351,
 'ait': 354,
 'liar': 355,
 'bittinger': 356,
 'psychoanalyzing': 357,
 'cattleman': 358,
 'tetsuoooo': 360,
 'colcollins': 61837,
 'composers': 361,
 'borje': 362,
 'laguna': 363,
 'skeptically': 364,
 'unnervingly': 365,
 'seberg': 366,
 'directional': 367,
 'unneccesary': 368,
 'luxembourg': 369,
 'celario': 371,
 'belle': 39393,
 'trust': 372,
 'libyan': 376,
 'trask': 374,
 'inspite': 378,
 'voluble': 61838,
 'pazienza': 377,
 'rosnelski': 380,
 'reah': 381,
 'montenegrin': 67871,
 'cinders': 385,
 'gifford': 384,
 'hamish': 386,
 'nathalie': 387,
 'sharpish': 389,
 'launchers': 24626,
 'kornman': 390,
 'nutshell': 391,
 'skogland': 394,
 'sorenson': 393,
 'antagonistic': 395,
 'challenging': 58643,
 'sainsburys': 24628,
 'unjaded': 12466,
 'readjusts': 398,
 'messrs': 397,
 'psychiatry': 399,
 'gunfight': 402,
 'thunders': 62820,
 'anomalies': 49531,
 'pedicab': 403,
 'reverend': 37195,
 'homour': 404,
 'overdosed': 31033,
 'squirty': 405,
 'uncooked': 406,
 'curves': 37196,
 'dragos': 407,
 'finis': 408,
 'sunscreen': 409,
 'screamed': 410,
 'nit': 411,
 'underly': 412,
 'leave': 413,
 'bang': 414,
 'bankol': 61846,
 'riffen': 415,
 'northeastern': 417,
 'clue': 418,
 'demoiselle': 48627,
 'antitrust': 419,
 'christs': 420,
 'picnics': 421,
 'nieces': 424,
 'squares': 423,
 'cote': 37198,
 'pogany': 61848,
 'shredded': 427,
 'gobo': 426,
 'laughing': 57018,
 'bruce': 428,
 'computational': 435,
 'herbet': 431,
 'complement': 432,
 'grod': 433,
 'johars': 434,
 'transsexual': 436,
 'beginner': 437,
 'screaming': 37203,
 'herek': 438,
 'ransacking': 446,
 'blueberry': 443,
 'hessling': 441,
 'vallette': 445,
 'mono': 448,
 'phobias': 449,
 'chides': 24636,
 'infraction': 451,
 'unarguably': 61852,
 'creature': 452,
 'antonik': 74,
 'xmas': 454,
 'vigor': 457,
 'rail': 459,
 'trucking': 463,
 'exclusive': 462,
 'spooks': 464,
 'nekkidness': 465,
 'borlenghi': 466,
 'gut': 467,
 'neorealist': 468,
 'colours': 61855,
 'somersault': 472,
 'cortner': 473,
 'earthier': 475,
 'especialy': 49539,
 'umm': 476,
 'flounder': 75,
 'speakman': 478,
 'iaido': 24639,
 'dayton': 479,
 'momentarily': 483,
 'funiest': 481,
 'reflexive': 482,
 'ugliest': 484,
 'binso': 485,
 'realistically': 486,
 'yacca': 487,
 'driven': 488,
 'invokes': 489,
 'clavius': 491,
 'kapor': 492,
 'anxiety': 494,
 'besson': 495,
 'surplus': 496,
 'cornball': 497,
 'flooded': 498,
 'fai': 502,
 'wrecker': 500,
 'servicable': 503,
 'reedus': 508,
 'diligent': 504,
 'uncredited': 505,
 'webbed': 511,
 'hilariousness': 507,
 'wookie': 24643,
 'heinz': 509,
 'obcession': 79,
 'uncomprehensible': 515,
 'parrots': 513,
 'chests': 518,
 'forklift': 516,
 'nigel': 517,
 'dheeraj': 73086,
 'zippy': 519,
 'leafy': 520,
 'aadha': 521,
 'philly': 62114,
 'pedophile': 49545,
 'fothergill': 522,
 'deceitfulness': 523,
 'seawall': 524,
 'tazmainian': 525,
 'sargoth': 526,
 'bulkhead': 527,
 'halsslag': 528,
 'mikes': 529,
 'tuberculosis': 531,
 'craziness': 33539,
 'chanson': 71520,
 'unified': 12490,
 'whitehall': 532,
 'cartwheel': 533,
 'overanxious': 58095,
 'velvety': 534,
 'nikos': 52147,
 'berzier': 63001,
 'pe': 24651,
 'fickleness': 535,
 'nuttiest': 70825,
 'pluperfect': 536,
 'quarantines': 537,
 'mcgraw': 538,
 'atkine': 539,
 'wringing': 540,
 'shilton': 542,
 'oedepus': 61866,
 'burrier': 543,
 'cranes': 49548,
 'brewing': 58167,
 'nefretiri': 544,
 'dugout': 61867,
 'homes': 545,
 'grounded': 548,
 'connolly': 547,
 'stirba': 90,
 'mplayer': 550,
 'betray': 551,
 'arjuna': 552,
 'breaker': 553,
 'converge': 554,
 'cowen': 555,
 'aboriginies': 24658,
 'bonnet': 556,
 'discs': 557,
 'slamming': 24659,
 'mallet': 559,
 'swit': 560,
 'unescapably': 561,
 'clinical': 562,
 'deadness': 564,
 'tenement': 565,
 'tarazu': 58430,
 'unusal': 24870,
 'jud': 566,
 'northram': 66990,
 'indicted': 567,
 'kadeem': 573,
 'retired': 570,
 'ole': 571,
 'gloss': 572,
 'mgs': 575,
 'degli': 49553,
 'prety': 576,
 'leaven': 578,
 'moderne': 18709,
 'theatregoers': 579,
 'inverted': 582,
 'simper': 581,
 'succeed': 59016,
 'assasain': 583,
 'huck': 584,
 'features': 585,
 'machinations': 37227,
 'dumbss': 586,
 'charolette': 587,
 'brethren': 12501,
 'beijing': 588,
 'tiw': 30290,
 'plotters': 61870,
 'nodes': 589,
 'bloke': 590,
 'sermonizing': 591,
 'pedophiliac': 73369,
 'teachers': 95,
 'imitation': 592,
 'toys': 61875,
 'deteriorates': 593,
 'barbapapa': 596,
 'rob': 595,
 'napunsaktha': 67726,
 'dorrit': 52154,
 'proctology': 597,
 'squeeze': 598,
 'reunions': 599,
 'chamcha': 70745,
 'lyduschka': 600,
 'joyriding': 601,
 'alisande': 12508,
 'brisco': 602,
 'freezer': 603,
 'caimano': 604,
 'kove': 605,
 'hollywod': 609,
 'served': 607,
 'griefs': 608,
 'timesfunny': 610,
 'mfn': 611,
 'markell': 612,
 'chiles': 613,
 'brolin': 99,
 'hodgins': 615,
 'constained': 51786,
 'beginning': 617,
 'eurpeans': 61880,
 'spouses': 73174,
 'straight': 618,
 'hatian': 619,
 'authorty': 623,
 'medusans': 621,
 'saddled': 622,
 'peeve': 59845,
 'wippleman': 624,
 'douses': 627,
 'drilling': 629,
 'milimeters': 630,
 'bumpuses': 635,
 'prowlin': 29313,
 'messanger': 633,
 'clifford': 12514,
 'disovered': 636,
 'elevates': 104,
 'acrid': 639,
 'rationally': 647,
 'accredited': 641,
 'salary': 642,
 'pencier': 643,
 'blown': 644,
 'unify': 645,
 'lodgings': 646,
 'dof': 649,
 'merciful': 650,
 'intruder': 654,
 'borga': 657,
 'understorey': 653,
 'denigrated': 655,
 'tampax': 656,
 'villian': 658,
 'hards': 659,
 'ferrara': 24679,
 'depleted': 661,
 'ramallo': 12516,
 'serving': 662,
 'cremate': 663,
 'shopgirl': 664,
 'accompany': 665,
 'immaturity': 666,
 'autocue': 667,
 'jaffer': 668,
 'bunnie': 669,
 'rackham': 25394,
 'cyberspace': 671,
 'dissipating': 51031,
 'mega': 37242,
 'testify': 673,
 'destruction': 674,
 'tila': 677,
 'reminiscent': 679,
 'wallbangers': 681,
 'asphyxiated': 682,
 'intentionally': 683,
 'cobra': 61886,
 'nekromantik': 684,
 'spurn': 685,
 'abduction': 37980,
 'comparisons': 112,
 'vampyros': 34335,
 'contrary': 686,
 'admiration': 687,
 'inhabitant': 688,
 'banks': 689,
 'themthe': 691,
 'alderson': 33473,
 'underhandedness': 692,
 'uphold': 69319,
 'halbert': 693,
 'colin': 694,
 'pumped': 695,
 'vovchenko': 696,
 'montreux': 116,
 'fowler': 702,
 'girl': 68327,
 'linn': 703,
 'montoss': 63048,
 'meridian': 704,
 'unlisted': 118,
 'sportscaster': 706,
 'trifled': 707,
 'secularity': 708,
 'shaggy': 709,
 'afortunately': 710,
 'cure': 711,
 'tricia': 713,
 'bunched': 714,
 'option': 49568,
 'linklater': 37251,
 'disagreeable': 718,
 'draped': 719,
 'dramatized': 720,
 'guffaw': 721,
 'zooms': 53698,
 'bonin': 724,
 'vegetate': 725,
 'dig': 726,
 'thier': 121,
 'trampled': 728,
 'dabrova': 729,
 'unwillingly': 730,
 'anywhere': 731,
 'navin': 732,
 'barnet': 733,
 'faultless': 37254,
 'waterloo': 735,
 'horsing': 736,
 'espouse': 49573,
 'statistically': 737,
 'mise': 24690,
 'benchley': 738,
 'sloooow': 739,
 'surmise': 740,
 'insinuates': 741,
 'colbet': 742,
 'underwent': 73868,
 'maloney': 743,
 'crashers': 744,
 'sizeable': 745,
 'cryo': 24691,
 'extramarital': 747,
 'nineveh': 748,
 'obtrusively': 61894,
 'herded': 749,
 'expansion': 750,
 'carmella': 752,
 'porridge': 753,
 'vamping': 755,
 'condenses': 756,
 'enliven': 757,
 'shoestring': 49579,
 'erroll': 24903,
 'pingo': 758,
 'stacie': 759,
 'parter': 761,
 'bloodrayne': 762,
 'firebombing': 763,
 'nist': 764,
 'friggin': 766,
 'tomorrowland': 770,
 'laundromat': 768,
 'joao': 771,
 'mired': 24699,
 'henley': 774,
 'bloodline': 773,
 'britain': 775,
 'uesa': 780,
 'loyalk': 778,
 'reused': 49583,
 'uns': 66854,
 'lynched': 781,
 'ruinously': 782,
 'storytelling': 783,
 'distinctively': 784,
 'teasingly': 786,
 'latifah': 787,
 'gazelles': 788,
 'oats': 789,
 'truest': 129,
 'yoshiyuki': 790,
 'brandishes': 791,
 'oughta': 792,
 'adaptaion': 12541,
 'coughthe': 793,
 'satanism': 12914,
 'knucklehead': 801,
 'castrol': 798,
 'trini': 799,
 'ingredients': 800,
 'naive': 803,
 'morto': 807,
 'twisters': 805,
 'harming': 806,
 'wackier': 808,
 'ceramics': 809,
 'ospenskya': 810,
 'crochet': 811,
 'machievellian': 812,
 'iafrika': 813,
 'regresses': 815,
 'toonami': 816,
 'emblazoned': 817,
 'kasch': 818,
 'describing': 24706,
 'shoulder': 820,
 'future': 822,
 'harried': 825,
 'cobb': 824,
 'condor': 831,
 'chuckles': 827,
 'domingo': 829,
 'artfulness': 12546,
 'slevin': 832,
 'alamothirteen': 72561,
 'trampling': 833,
 'suborned': 834,
 'array': 835,
 'subset': 836,
 'salin': 839,
 'redefined': 61904,
 'hare': 45243,
 'syringes': 840,
 'mexicans': 841,
 'kasam': 850,
 'glazed': 5536,
 'jadoo': 844,
 'headache': 845,
 'krocodylus': 848,
 'innermost': 61908,
 'refractive': 851,
 'chaingun': 854,
 'dogie': 853,
 'unbounded': 855,
 'uppance': 857,
 'ramme': 49597,
 'oldies': 859,
 'bloodsucking': 860,
 'baddy': 61909,
 'switzerland': 862,
 'bales': 863,
 'culpable': 864,
 'constraints': 865,
 'deutsch': 866,
 'excursion': 867,
 'pockets': 868,
 'dysantry': 870,
 'ekland': 24713,
 'dumber': 871,
 'leafs': 872,
 'hault': 137,
 'fantasythey': 12928,
 'bared': 875,
 'okey': 877,
 'exempt': 878,
 'madama': 879,
 'awesomenes': 880,
 'hobgobblins': 12554,
 'accessorizing': 881,
 'dorsey': 71328,
 'deplorable': 10761,
 'popsicles': 882,
 'embody': 883,
 'reciprocate': 884,
 'lynton': 885,
 'foreign': 887,
 'okw': 890,
 'decency': 63143,
 'careering': 67586,
 'backlashes': 891,
 'endemic': 13074,
 'bhi': 895,
 'sendup': 894,
 'charts': 896,
 'churchman': 898,
 'devestated': 49604,
 'landauer': 144,
 'irvin': 904,
 'akroyd': 901,
 'federal': 902,
 'merging': 903,
 'anatole': 905,
 'lorne': 906,
 'graduation': 911,
 'aidsssss': 909,
 'embassy': 910,
 'majorca': 54034,
 'afilm': 912,
 'appoint': 914,
 'heth': 915,
 'regrettably': 916,
 'posterity': 917,
 'dormant': 918,
 'sacrificing': 919,
 'albizo': 920,
 'waterstone': 72580,
 'rigors': 24722,
 'wholes': 924,
 'donation': 922,
 'rinse': 148,
 'soderberghian': 925,
 'sequenced': 931,
 'midget': 927,
 'ips': 932,
 'blowsy': 930,
 'shivers': 933,
 'sahay': 935,
 'electrocuted': 936,
 'simplebut': 937,
 'trice': 938,
 'ported': 940,
 'pitchforks': 941,
 'showers': 37282,
 'zeroing': 943,
 'jerilderie': 944,
 'genvieve': 945,
 'excretable': 946,
 'cifaretto': 947,
 'appallingly': 948,
 'ciphers': 949,
 'tibetan': 950,
 'asthmatic': 61921,
 'loretta': 952,
 'aboutagirly': 953,
 'detriments': 955,
 'kinsella': 956,
 'spells': 957,
 'gwyenths': 958,
 'delhi': 959,
 'doa': 960,
 'uninjured': 962,
 'sal': 963,
 'pitfalls': 964,
 'bottomless': 965,
 'befuddlement': 24725,
 'brood': 966,
 'ludwig': 32860,
 'shrug': 968,
 'anonymous': 969,
 'shiksa': 970,
 'deceives': 52089,
 'communist': 971,
 'khakis': 972,
 'sumitra': 56867,
 'logline': 973,
 'sorcery': 974,
 'sharmila': 975,
 'hoyts': 976,
 'rainforests': 978,
 'pazo': 979,
 'ostentation': 980,
 'chickenpox': 51368,
 'ultimately': 981,
 'hissing': 61924,
 'urinate': 983,
 'kaddiddlehopper': 984,
 'moonwalks': 985,
 'martinets': 986,
 'blimey': 68311,
 'heartstrings': 155,
 'bracho': 988,
 'surveillance': 49617,
 'taipei': 989,
 'payout': 37295,
 'giller': 990,
 'prometheus': 991,
 'motored': 992,
 'impressionists': 995,
 'crunchy': 994,
 'jew': 49324,
 'replies': 996,
 'zardoz': 61926,
 'recognizable': 997,
 'followed': 998,
 'dashiell': 28398,
 'jannsen': 24731,
 'waaaaaayyyy': 1002,
 'scalpels': 1004,
 'doubles': 1005,
 'ascend': 1006,
 'tangible': 1007,
 ...}

...and now we can use this new "word2index" dictionary to populate our input layer with the right 1s in the right places.


In [16]:
def update_input_layer(review):
    """Encode ``review`` into the global ``layer_0`` as word-presence flags.

    The review is split on single spaces. For every resulting token that has
    an entry in the global ``word2index``, the matching column of ``layer_0``
    is set to 1. Tokens without an entry are skipped, which matches
    ``SentimentNetwork.update_input_layer``; previously they raised KeyError.

    Args:
        review: review text whose words are separated by single spaces.
    """
    # `*=` rebinds the name, so it has to be declared global even though the
    # array itself is modified in place.
    global layer_0

    # clear out previous state, reset the layer to be all 0s
    layer_0 *= 0
    for word in review.split(" "):
        index = word2index.get(word)
        if index is not None:
            layer_0[0][index] = 1

update_input_layer(reviews[0])

In [17]:
layer_0


Out[17]:
array([[ 1.,  0.,  0., ...,  0.,  0.,  0.]])

Creating the Target Data


In [18]:
def get_target_for_label(label):
    """Map a label string to the numeric training target.

    Returns 1 when ``label`` is exactly 'POSITIVE' and 0 for anything else.
    """
    return 1 if label == 'POSITIVE' else 0

In [19]:
get_target_for_label(labels[0])


Out[19]:
1

In [20]:
get_target_for_label(labels[1])


Out[20]:
0

Putting it all together in a Neural Network


In [21]:
from IPython.display import Image
Image(filename='sentiment_network_2.png')


Out[21]:

In [22]:
import time
import sys
import numpy as np

# Let's tweak our network from before to model these phenomena
class SentimentNetwork:
    """Bag-of-words sentiment classifier with one linear hidden layer.

    A review is encoded as a row of 0/1 word-presence flags (``layer_0``),
    multiplied by ``weights_0_1`` to form the hidden layer (no nonlinearity),
    then multiplied by ``weights_1_2`` and passed through a sigmoid. An output
    above 0.5 is reported as "POSITIVE", anything else as "NEGATIVE".
    """

    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.1):
        """Build the vocabulary from the given data and allocate the weights.

        Args:
            reviews: iterable of review strings; words are separated by single
                spaces.
            labels: iterable of label strings.
            hidden_nodes: number of hidden-layer units.
            learning_rate: step size applied to every weight update in ``train``.
        """
        # Seeds NumPy's global RNG so the random weights_1_2 are repeatable.
        np.random.seed(1)

        # Use the constructor arguments (not notebook-level globals) so the
        # vocabulary only reflects the data this network was given.
        self.pre_process_data(reviews, labels)

        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):
        """Collect the word and label vocabularies and their index lookups.

        Sets ``review_vocab``, ``label_vocab``, their sizes, ``word2index`` and
        ``label2index`` on the instance.

        Args:
            reviews: iterable of review strings, split on single spaces.
            labels: iterable of label strings.
        """
        review_vocab = set()
        for review in reviews:
            review_vocab.update(review.split(" "))
        self.review_vocab = list(review_vocab)

        self.label_vocab = list(set(labels))

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # Position of each word / label inside its vocabulary list.
        self.word2index = {word: i for i, word in enumerate(self.review_vocab)}
        self.label2index = {label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and create the weight matrices and input row.

        Args:
            input_nodes: number of input units (one per vocabulary word).
            hidden_nodes: number of hidden units.
            output_nodes: number of output units.
            learning_rate: step size used by ``train``.
        """
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Input-to-hidden weights start at zero; hidden-to-output weights are
        # drawn from a normal distribution with std output_nodes ** -0.5.
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))

        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                            (self.hidden_nodes, self.output_nodes))

        self.learning_rate = learning_rate

        # Reused for every review; update_input_layer rewrites it in place.
        self.layer_0 = np.zeros((1, input_nodes))

    def update_input_layer(self, review):
        """Rewrite ``self.layer_0`` with word-presence flags for ``review``.

        Words that are not in ``self.word2index`` are ignored.
        """
        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        for word in review.split(" "):
            if word in self.word2index:
                self.layer_0[0][self.word2index[word]] = 1

    def get_target_for_label(self, label):
        """Return 1 for the label 'POSITIVE' and 0 for any other label."""
        if label == 'POSITIVE':
            return 1
        return 0

    def sigmoid(self, x):
        """Logistic function 1 / (1 + e^-x)."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        """Derivative of the sigmoid, expressed in terms of its output value."""
        return output * (1 - output)

    @staticmethod
    def _reviews_per_second(processed, start):
        """Throughput since ``start``; 0.0 when the clock has not advanced yet."""
        elapsed = time.time() - start
        if elapsed > 0:
            return processed / elapsed
        return 0.0

    def train(self, training_reviews, training_labels):
        """Run one pass of per-review gradient descent and print progress.

        Args:
            training_reviews: sequence of review strings.
            training_labels: sequence of label strings, same length as
                ``training_reviews``.

        Raises:
            AssertionError: if the two sequences differ in length.
        """
        assert(len(training_reviews) == len(training_labels))

        correct_so_far = 0

        start = time.time()

        for i in range(len(training_reviews)):

            review = training_reviews[i]
            label = training_labels[i]

            ### Forward pass ###

            # Input Layer
            self.update_input_layer(review)

            # Hidden layer (linear, no activation)
            layer_1 = self.layer_0.dot(self.weights_0_1)

            # Output layer
            layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))

            ### Backward pass ###

            # Output error: prediction minus target, scaled by the sigmoid slope.
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Error propagated to the hidden layer; with no nonlinearity the
            # hidden delta equals the propagated error.
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)
            layer_1_delta = layer_1_error

            # Gradient descent step on both weight matrices.
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate
            self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate

            # An absolute error below 0.5 means the output fell on the
            # target's side of the 0.5 decision threshold.
            if(np.abs(layer_2_error) < 0.5):
                correct_so_far += 1

            reviews_per_second = self._reviews_per_second(i, start)

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")

    def test(self, testing_reviews, testing_labels):
        """Predict every review, compare with its label and print running accuracy.

        Args:
            testing_reviews: sequence of review strings.
            testing_labels: sequence of expected label strings
                ("POSITIVE" / "NEGATIVE").
        """
        correct = 0

        start = time.time()

        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            reviews_per_second = self._reviews_per_second(i, start)

            # NOTE: the "%" printed after the speed value is kept as-is so the
            # output format stays the same as in previously recorded runs.
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def run(self, review):
        """Classify a single review.

        The review is lower-cased before encoding.

        Returns:
            "POSITIVE" if the network output is above 0.5, else "NEGATIVE".
        """
        # Input Layer
        self.update_input_layer(review.lower())

        # Hidden layer
        layer_1 = self.layer_0.dot(self.weights_0_1)

        # Output layer
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))

        if(layer_2[0][0] > 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [23]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000])

In [24]:
# evaluate our model before training (just to show how horrible it is)
mlp.test(reviews[-1000:],labels[-1000:])


Progress:99.9% Speed(reviews/sec):786.4% #Correct:500 #Tested:1000 Testing Accuracy:50.0%

In [25]:
# train the network
mlp.train(reviews[:-1000],labels[:-1000])


Progress:99.9% Speed(reviews/sec):116.0 #Correct:20073 #Trained:24000 Training Accuracy:83.6%

In [26]:
# evaluate the model after training
mlp.test(reviews[-1000:],labels[-1000:])


Progress:99.9% Speed(reviews/sec):814.7% #Correct:856 #Tested:1000 Testing Accuracy:85.6%

In [27]:
mlp.run("That movie was great")


Out[27]:
'POSITIVE'

Making our Network Train and Run Faster


In [28]:
layer_0 = np.zeros(10)

In [29]:
layer_0


Out[29]:
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [30]:
# Switch on positions 4 and 9 in one indexed assignment, then display the array.
layer_0[[4, 9]] = 1
layer_0


Out[30]:
array([ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.])

In [31]:
weights_0_1 = np.random.randn(10,5)

In [32]:
layer_1 = layer_0.dot(weights_0_1)

In [33]:
layer_1


Out[33]:
array([-0.10503756,  0.44222989,  0.24392938, -0.55961832,  0.21389503])

In [34]:
Image(filename='sentiment_network_sparse.png')


Out[34]:

First Inefficiency: "0" neurons waste computation


In [35]:
Image(filename='sentiment_network_sparse_2.png')


Out[35]:

Second Inefficiency: "1" neurons don't need to multiply!

The Solution: Create layer_1 by adding the vectors for each word.


In [36]:
# Inefficient approach used so far: a full vector-matrix product, even though
# layer_0 is zero everywhere except at indices 4 and 9 (set in an earlier cell).

layer_1 = layer_0.dot(weights_0_1)
layer_1


Out[36]:
array([-0.10503756,  0.44222989,  0.24392938, -0.55961832,  0.21389503])

In [37]:
# New, less expensive lookup-table version: since layer_0 holds 1s only at
# indices 4 and 9, the product reduces to adding rows 4 and 9 of weights_0_1 -
# no multiplications and no work spent on the zero entries.

layer_1 = weights_0_1[4] + weights_0_1[9]
layer_1


Out[37]:
array([-0.10503756,  0.44222989,  0.24392938, -0.55961832,  0.21389503])

See how they generate exactly the same value? Let's update our new neural network to do this.


In [69]:
import time
import sys

# Let's tweak our network from before to model these phenomena
class SentimentNetwork:
    """Bag-of-words sentiment classifier with a single linear hidden layer.

    ``train`` and ``run`` never build the 0/1 input vector. The hidden layer is
    formed by summing the rows of ``weights_0_1`` that belong to the words
    present in a review, and only those rows are touched when the weights are
    updated. Words that are not in the vocabulary are ignored everywhere.
    """

    def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1):
        """Build the vocabularies and initialise the network.

        Args:
            reviews: Iterable of review strings whose words are separated by
                single spaces; used to build the word vocabulary.
            labels: Iterable of label values; used to build the label vocabulary.
            hidden_nodes: Number of units in the hidden layer.
            learning_rate: Step size used by ``train``.
        """
        # Seed NumPy's global generator so weight initialisation is repeatable.
        np.random.seed(1)

        # Pass the labels explicitly rather than relying on a module-level
        # ``labels`` variable happening to exist.
        self.pre_process_data(reviews, labels)

        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels=None):
        """Create the word/label vocabularies and their index lookup tables.

        Args:
            reviews: Iterable of review strings split on single spaces.
            labels: Optional iterable of labels. When omitted, the label
                vocabulary is left empty.

        Sets ``review_vocab``, ``label_vocab``, ``review_vocab_size``,
        ``label_vocab_size``, ``word2index`` and ``label2index``.
        """
        review_vocab = set()
        for review in reviews:
            review_vocab.update(review.split(" "))
        self.review_vocab = list(review_vocab)

        label_vocab = set()
        if labels is not None:
            label_vocab.update(labels)
        self.label_vocab = list(label_vocab)

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # Position of each word / label inside its vocabulary list.
        self.word2index = {word: i for i, word in enumerate(self.review_vocab)}
        self.label2index = {label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Allocate the weight matrices and the reusable layer buffers.

        Args:
            input_nodes: Number of input units (one per vocabulary word).
            hidden_nodes: Number of hidden units.
            output_nodes: Number of output units.
            learning_rate: Step size used by ``train``.
        """
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Input-to-hidden weights start at zero; hidden-to-output weights are
        # drawn from a normal distribution with std output_nodes**-0.5.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))

        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                                (self.hidden_nodes, self.output_nodes))

        self.learning_rate = learning_rate

        # Buffers that are cleared and refilled in place.
        self.layer_0 = np.zeros((1,input_nodes))
        self.layer_1 = np.zeros((1,hidden_nodes))

    def sigmoid(self,x):
        """Return the logistic function 1 / (1 + exp(-x)) of ``x``."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self,output):
        """Return the sigmoid derivative expressed via the sigmoid's own output."""
        return output * (1 - output)

    def update_input_layer(self,review):
        """Fill ``self.layer_0`` with a 0/1 presence vector for ``review``.

        Words missing from the vocabulary are skipped, matching how ``train``
        and ``run`` treat them.
        """
        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        for word in review.split(" "):
            if word in self.word2index:
                self.layer_0[0][self.word2index[word]] = 1

    def get_target_for_label(self,label):
        """Map 'POSITIVE' to 1 and every other label to 0."""
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0

    def _review_to_indices(self, review):
        """Return the unique vocabulary indices of the known words in ``review``."""
        indices = set()
        for word in review.split(" "):
            if word in self.word2index:
                indices.add(self.word2index[word])
        return list(indices)

    @staticmethod
    def _reviews_per_second(processed, start):
        """Return ``processed`` divided by the seconds elapsed since ``start``.

        Returns 0.0 when the clock has not advanced, so a coarse timer cannot
        trigger a division by zero on the first iterations.
        """
        elapsed = time.time() - start
        if elapsed > 0:
            return processed / float(elapsed)
        return 0.0

    def train(self, training_reviews_raw, training_labels):
        """Run one pass of per-review gradient descent over the training data.

        Args:
            training_reviews_raw: Sequence of review strings.
            training_labels: Sequence of labels aligned with the reviews.

        Progress, speed and running training accuracy are written to stdout on
        a single line that is rewritten after every review.
        """
        # Convert every review once into the indices of the words it contains.
        training_reviews = [self._review_to_indices(review)
                            for review in training_reviews_raw]

        assert(len(training_reviews) == len(training_labels))

        correct_so_far = 0

        start = time.time()

        for i in range(len(training_reviews)):

            review = training_reviews[i]
            label = training_labels[i]

            ### Forward pass ###

            # Hidden layer: sum the weight rows of the words present instead of
            # multiplying a mostly-zero input vector by the full weight matrix.
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            # Output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

            ### Backward pass ###

            # Output layer error is the difference between actual output and desired target.
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Error propagated back to the hidden layer; with no nonlinearity
            # there, the gradient equals the error.
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)
            layer_1_delta = layer_1_error

            # Gradient descent step on the hidden-to-output weights.
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate

            # Only rows belonging to words in this review receive an update.
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate

            # A prediction counts as correct when it lands on the right side of 0.5.
            if(np.abs(layer_2_error) < 0.5):
                correct_so_far += 1

            reviews_per_second = self._reviews_per_second(i, start)

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")

    def test(self, testing_reviews, testing_labels):
        """Predict every review and report running accuracy on stdout.

        Args:
            testing_reviews: Sequence of review strings.
            testing_labels: Sequence of expected labels ("POSITIVE"/"NEGATIVE").
        """
        correct = 0

        start = time.time()

        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            reviews_per_second = self._reviews_per_second(i, start)

            # Same layout as the training progress line (no "%" after the speed).
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + " #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def run(self, review):
        """Predict the sentiment label of a single review.

        Args:
            review: Review text; it is lower-cased before lookup.

        Returns:
            "POSITIVE" when the network output is strictly greater than 0.5,
            otherwise "NEGATIVE".
        """
        # Hidden layer: sum the weight rows of the distinct known words.
        self.layer_1 *= 0
        for index in self._review_to_indices(review.lower()):
            self.layer_1 += self.weights_0_1[index]

        # Output layer
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

        if(layer_2[0] > 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [70]:
mlp_full = SentimentNetwork(reviews[:-1000],labels[:-1000],learning_rate=0.01)

In [71]:
# train the network
mlp_full.train(reviews[:-1000],labels[:-1000])


Progress:99.9% Speed(reviews/sec):1478. #Correct:20334 #Trained:24000 Training Accuracy:84.7%

In [72]:
# evaluate the model after training
mlp_full.test(reviews[-1000:],labels[-1000:])


Progress:99.9% Speed(reviews/sec):1958.% #Correct:856 #Tested:1000 Testing Accuracy:85.6%

Making Learning Faster & Easier by Reducing Noise


In [42]:
# words with the highest pos_neg_ratios score, i.e. most strongly associated with a "POSITIVE" label
pos_neg_ratios.most_common()


Out[42]:
[('edie', 4.6913478822291435),
 ('antwone', 4.4773368144782069),
 ('din', 4.4067192472642533),
 ('gunga', 4.1896547420264252),
 ('goldsworthy', 4.1743872698956368),
 ('gypo', 4.0943445622221004),
 ('yokai', 4.0943445622221004),
 ('paulie', 4.0775374439057197),
 ('visconti', 3.9318256327243257),
 ('flavia', 3.9318256327243257),
 ('kells', 3.8712010109078911),
 ('blandings', 3.8712010109078911),
 ('brashear', 3.8501476017100584),
 ('gino', 3.8286413964890951),
 ('deathtrap', 3.8066624897703196),
 ('panahi', 3.713572066704308),
 ('harilal', 3.713572066704308),
 ('ossessione', 3.6635616461296463),
 ('tsui', 3.6375861597263857),
 ('caruso', 3.6375861597263857),
 ('sabu', 3.6109179126442243),
 ('ahmad', 3.6109179126442243),
 ('khouri', 3.5835189384561099),
 ('dominick', 3.5835189384561099),
 ('aweigh', 3.5553480614894135),
 ('mj', 3.5553480614894135),
 ('mcintire', 3.5263605246161616),
 ('kriemhild', 3.5263605246161616),
 ('blackie', 3.4965075614664802),
 ('newcombe', 3.4965075614664802),
 ('daisies', 3.4965075614664802),
 ('trelkovsky', 3.4657359027997265),
 ('kei', 3.4657359027997265),
 ('jaffar', 3.4339872044851463),
 ('hilliard', 3.4339872044851463),
 ('gundam', 3.4231762883809305),
 ('sheeta', 3.4011973816621555),
 ('pazu', 3.4011973816621555),
 ('bathsheba', 3.4011973816621555),
 ('krell', 3.4011973816621555),
 ('offside', 3.4011973816621555),
 ('fineman', 3.3672958299864741),
 ('venoms', 3.3672958299864741),
 ('ranma', 3.3322045101752038),
 ('ronny', 3.3322045101752038),
 ('paine', 3.3322045101752038),
 ('pimlico', 3.3322045101752038),
 ('abhay', 3.2958368660043291),
 ('iturbi', 3.2771447329921766),
 ('pym', 3.2580965380214821),
 ('kipling', 3.2580965380214821),
 ('audiard', 3.2188758248682006),
 ('kelso', 3.2188758248682006),
 ('milverton', 3.2188758248682006),
 ('scalise', 3.2188758248682006),
 ('gabe', 3.2188758248682006),
 ('feinstone', 3.1780538303479458),
 ('mukhsin', 3.1780538303479458),
 ('grisby', 3.1780538303479458),
 ('xica', 3.1780538303479458),
 ('moonwalker', 3.1780538303479458),
 ('giovanna', 3.1780538303479458),
 ('felix', 3.1527360223636558),
 ('togar', 3.1354942159291497),
 ('chikatilo', 3.1354942159291497),
 ('heaton', 3.1354942159291497),
 ('jannings', 3.1354942159291497),
 ('luzhin', 3.1135153092103742),
 ('pidgeon', 3.0910424533583161),
 ('matuschek', 3.0910424533583161),
 ('miklos', 3.0910424533583161),
 ('soha', 3.0910424533583161),
 ('fanfan', 3.0910424533583161),
 ('desdemona', 3.0910424533583161),
 ('matador', 3.0910424533583161),
 ('leonora', 3.0910424533583161),
 ('philo', 3.068052935133617),
 ('microfilm', 3.044522437723423),
 ('firemen', 3.044522437723423),
 ('gauri', 3.044522437723423),
 ('lindy', 3.044522437723423),
 ('maradona', 3.044522437723423),
 ('bjm', 3.044522437723423),
 ('joss', 3.044522437723423),
 ('reda', 3.044522437723423),
 ('capote', 3.0122615755052013),
 ('fido', 3.0081547935525483),
 ('mcintyre', 2.9957322735539909),
 ('prote', 2.9957322735539909),
 ('siegfried', 2.9957322735539909),
 ('emory', 2.9957322735539909),
 ('coonskin', 2.9957322735539909),
 ('quibble', 2.9957322735539909),
 ('carrre', 2.9957322735539909),
 ('coe', 2.9957322735539909),
 ('excellently', 2.9789251552376097),
 ('clutter', 2.9704144655697009),
 ('pakeezah', 2.9444389791664403),
 ('ferdie', 2.9444389791664403),
 ('ackland', 2.9444389791664403),
 ('anchors', 2.9444389791664403),
 ('baloo', 2.9444389791664403),
 ('knockout', 2.9444389791664403),
 ('burakov', 2.9444389791664403),
 ('rvd', 2.9444389791664403),
 ('railly', 2.9444389791664403),
 ('schlesinger', 2.9444389791664403),
 ('flippen', 2.9444389791664403),
 ('pinjar', 2.9444389791664403),
 ('digicorp', 2.9444389791664403),
 ('hillyer', 2.9444389791664403),
 ('vance', 2.9444389791664403),
 ('magnus', 2.9444389791664403),
 ('petiot', 2.9444389791664403),
 ('versatility', 2.9444389791664403),
 ('malfique', 2.9444389791664403),
 ('kolchak', 2.9311937524164198),
 ('hayworth', 2.9267394020670396),
 ('deanna', 2.9267394020670396),
 ('iek', 2.8903717578961645),
 ('lando', 2.8903717578961645),
 ('geer', 2.8903717578961645),
 ('burgade', 2.8903717578961645),
 ('falco', 2.8903717578961645),
 ('pollak', 2.8903717578961645),
 ('guerrero', 2.8903717578961645),
 ('hobson', 2.8903717578961645),
 ('pappas', 2.8903717578961645),
 ('volckman', 2.8903717578961645),
 ('hoechlin', 2.8903717578961645),
 ('iphigenia', 2.8903717578961645),
 ('korda', 2.8622008809294686),
 ('sammo', 2.8526314299133175),
 ('orked', 2.8332133440562162),
 ('nighy', 2.8332133440562162),
 ('cdric', 2.8332133440562162),
 ('laputa', 2.8332133440562162),
 ('naudet', 2.8332133440562162),
 ('callahan', 2.8332133440562162),
 ('beckett', 2.8332133440562162),
 ('biko', 2.8332133440562162),
 ('jeon', 2.8332133440562162),
 ('kralik', 2.8332133440562162),
 ('peralta', 2.8332133440562162),
 ('nagra', 2.8332133440562162),
 ('jacknife', 2.8332133440562162),
 ('toughness', 2.8332133440562162),
 ('hewlett', 2.8332133440562162),
 ('sox', 2.8332133440562162),
 ('polanski', 2.8233610476132043),
 ('alvin', 2.8183982582710754),
 ('matthau', 2.8067217286092401),
 ('aiello', 2.8033603809065348),
 ('gaiman', 2.7725887222397811),
 ('endor', 2.7725887222397811),
 ('janos', 2.7725887222397811),
 ('rotj', 2.7725887222397811),
 ('yvaine', 2.7725887222397811),
 ('macready', 2.7725887222397811),
 ('hulce', 2.7725887222397811),
 ('firefighter', 2.7725887222397811),
 ('mathieu', 2.7725887222397811),
 ('delpy', 2.7725887222397811),
 ('coulouris', 2.7725887222397811),
 ('natalia', 2.7725887222397811),
 ('bedknobs', 2.7725887222397811),
 ('bombshells', 2.7725887222397811),
 ('duffell', 2.7725887222397811),
 ('santos', 2.7725887222397811),
 ('antz', 2.7725887222397811),
 ('gackt', 2.7515353130419489),
 ('myrtle', 2.7515353130419489),
 ('adele', 2.7515353130419489),
 ('bake', 2.7408400239252009),
 ('gilliam', 2.7245795030534206),
 ('soutendijk', 2.7080502011022101),
 ('doktor', 2.7080502011022101),
 ('shintaro', 2.7080502011022101),
 ('burman', 2.7080502011022101),
 ('hilda', 2.7080502011022101),
 ('johnnie', 2.7080502011022101),
 ('baton', 2.7080502011022101),
 ('cognac', 2.7080502011022101),
 ('gunbuster', 2.7080502011022101),
 ('silberling', 2.7080502011022101),
 ('ashraf', 2.7080502011022101),
 ('gannon', 2.7080502011022101),
 ('uld', 2.7080502011022101),
 ('lanisha', 2.7080502011022101),
 ('ballantine', 2.7080502011022101),
 ('hickock', 2.7080502011022101),
 ('aviv', 2.7080502011022101),
 ('lian', 2.7080502011022101),
 ('bernsen', 2.7080502011022101),
 ('karas', 2.7080502011022101),
 ('egon', 2.7080502011022101),
 ('parador', 2.7080502011022101),
 ('oro', 2.7080502011022101),
 ('eustache', 2.7080502011022101),
 ('cheh', 2.7080502011022101),
 ('mcanally', 2.7080502011022101),
 ('calamai', 2.7080502011022101),
 ('kiley', 2.7080502011022101),
 ('goines', 2.7080502011022101),
 ('rotoscoped', 2.7080502011022101),
 ('schildkraut', 2.7080502011022101),
 ('grasshoppers', 2.7080502011022101),
 ('valette', 2.7080502011022101),
 ('victoria', 2.6810215287142909),
 ('partition', 2.6741486494265287),
 ('dench', 2.6741486494265287),
 ('conroy', 2.6741486494265287),
 ('beery', 2.6741486494265287),
 ('chavez', 2.6672282065819548),
 ('ratso', 2.653241964607215),
 ('atoz', 2.6390573296152584),
 ('kabei', 2.6390573296152584),
 ('kulkarni', 2.6390573296152584),
 ('rien', 2.6390573296152584),
 ('gardenia', 2.6390573296152584),
 ('emy', 2.6390573296152584),
 ('megs', 2.6390573296152584),
 ('hickam', 2.6390573296152584),
 ('danelia', 2.6390573296152584),
 ('duprez', 2.6390573296152584),
 ('embezzler', 2.6390573296152584),
 ('fetisov', 2.6390573296152584),
 ('treaty', 2.6390573296152584),
 ('scrat', 2.6390573296152584),
 ('laine', 2.6390573296152584),
 ('gialli', 2.6390573296152584),
 ('cb', 2.6390573296152584),
 ('ishwar', 2.6390573296152584),
 ('cartwrights', 2.6390573296152584),
 ('ingram', 2.6390573296152584),
 ('harriet', 2.6390573296152584),
 ('pang', 2.6390573296152584),
 ('melancholic', 2.6390573296152584),
 ('intricately', 2.6390573296152584),
 ('bathhouse', 2.6390573296152584),
 ('pilgrimage', 2.6390573296152584),
 ('tulip', 2.6390573296152584),
 ('beek', 2.6390573296152584),
 ('katsu', 2.6026896854443837),
 ('mildred', 2.6026896854443837),
 ('ultimatum', 2.6026896854443837),
 ('dev', 2.6026896854443837),
 ('fricker', 2.6026896854443837),
 ('emil', 2.6026896854443837),
 ('mclaglen', 2.5649493574615367),
 ('girotti', 2.5649493574615367),
 ('goring', 2.5649493574615367),
 ('guadalcanal', 2.5649493574615367),
 ('oakie', 2.5649493574615367),
 ('broadbent', 2.5649493574615367),
 ('sugiyama', 2.5649493574615367),
 ('tissues', 2.5649493574615367),
 ('luchino', 2.5649493574615367),
 ('nibelungen', 2.5649493574615367),
 ('cynics', 2.5649493574615367),
 ('mcdoakes', 2.5649493574615367),
 ('adjani', 2.5649493574615367),
 ('freebird', 2.5649493574615367),
 ('autograph', 2.5649493574615367),
 ('riget', 2.5649493574615367),
 ('odysseus', 2.5649493574615367),
 ('brownstone', 2.5649493574615367),
 ('choi', 2.5649493574615367),
 ('unsung', 2.5649493574615367),
 ('chavo', 2.5649493574615367),
 ('bahrain', 2.5649493574615367),
 ('holloway', 2.5649493574615367),
 ('sputnik', 2.5649493574615367),
 ('saura', 2.5649493574615367),
 ('boop', 2.5649493574615367),
 ('eglantine', 2.5649493574615367),
 ('gabriella', 2.5649493574615367),
 ('dola', 2.5649493574615367),
 ('erendira', 2.5649493574615367),
 ('bouvier', 2.5649493574615367),
 ('yelnats', 2.5649493574615367),
 ('corbett', 2.5494451709255714),
 ('warhols', 2.5389738710582761),
 ('gandhi', 2.5389738710582761),
 ('sammi', 2.5389738710582761),
 ('abu', 2.5389738710582761),
 ('zu', 2.5389738710582761),
 ('delightfully', 2.5257286443082556),
 ('sirk', 2.5199979695992702),
 ('rosenstrasse', 2.5123056239761148),
 ('creasy', 2.5055259369907361),
 ('braveheart', 2.5014359517392109),
 ('herge', 2.4849066497880004),
 ('barrister', 2.4849066497880004),
 ('santiago', 2.4849066497880004),
 ('cacoyannis', 2.4849066497880004),
 ('blackadder', 2.4849066497880004),
 ('vierde', 2.4849066497880004),
 ('lassalle', 2.4849066497880004),
 ('parminder', 2.4849066497880004),
 ('hayao', 2.4849066497880004),
 ('trenholm', 2.4849066497880004),
 ('bressart', 2.4849066497880004),
 ('natures', 2.4849066497880004),
 ('presque', 2.4849066497880004),
 ('yuzna', 2.4849066497880004),
 ('lafitte', 2.4849066497880004),
 ('mcadam', 2.4849066497880004),
 ('unpretentious', 2.4849066497880004),
 ('hecht', 2.4849066497880004),
 ('perdition', 2.4849066497880004),
 ('gallico', 2.4849066497880004),
 ('holodeck', 2.4849066497880004),
 ('balduin', 2.4849066497880004),
 ('bouzaglo', 2.4849066497880004),
 ('attila', 2.4849066497880004),
 ('mcphillip', 2.4849066497880004),
 ('kazan', 2.4849066497880004),
 ('rideau', 2.4849066497880004),
 ('luger', 2.4849066497880004),
 ('bischoff', 2.4849066497880004),
 ('poonam', 2.4849066497880004),
 ('talos', 2.4849066497880004),
 ('binder', 2.4849066497880004),
 ('euripides', 2.4849066497880004),
 ('killian', 2.4849066497880004),
 ('lupino', 2.4849066497880004),
 ('yeon', 2.4849066497880004),
 ('strindberg', 2.4849066497880004),
 ('hanlon', 2.4849066497880004),
 ('anselmo', 2.4849066497880004),
 ('clutters', 2.4849066497880004),
 ('vonnegut', 2.4638532405901681),
 ('mccoy', 2.456735772821304),
 ('taker', 2.456735772821304),
 ('flawless', 2.451005098112319),
 ('othello', 2.4485390056171252),
 ('natali', 2.4423470353692043),
 ('abbey', 2.4423470353692043),
 ('godmother', 2.4423470353692043),
 ('judi', 2.4423470353692043),
 ('jonestown', 2.4423470353692043),
 ('mahatma', 2.4423470353692043),
 ('mcnally', 2.4423470353692043),
 ('novak', 2.4361164856185682),
 ('durbin', 2.4277482359480516),
 ('christy', 2.4203681286504293),
 ('cheadle', 2.4159137783010487),
 ('unsurpassed', 2.3978952727983707),
 ('hallen', 2.3978952727983707),
 ('rawhide', 2.3978952727983707),
 ('eisenhower', 2.3978952727983707),
 ('faultless', 2.3978952727983707),
 ('wilhelm', 2.3978952727983707),
 ('vadar', 2.3978952727983707),
 ('capano', 2.3978952727983707),
 ('eminent', 2.3978952727983707),
 ('waterman', 2.3978952727983707),
 ('leaud', 2.3978952727983707),
 ('hanka', 2.3978952727983707),
 ('prologues', 2.3978952727983707),
 ('muska', 2.3978952727983707),
 ('bartel', 2.3978952727983707),
 ('showings', 2.3978952727983707),
 ('nord', 2.3978952727983707),
 ('sweetin', 2.3978952727983707),
 ('rf', 2.3978952727983707),
 ('fellowes', 2.3978952727983707),
 ('fuller', 2.3978952727983707),
 ('faust', 2.3978952727983707),
 ('nyqvist', 2.3978952727983707),
 ('arrondissement', 2.3978952727983707),
 ('shep', 2.3978952727983707),
 ('stockwell', 2.3978952727983707),
 ('radiant', 2.3978952727983707),
 ('dragoon', 2.3978952727983707),
 ('starewicz', 2.3978952727983707),
 ('tetsur', 2.3978952727983707),
 ('ramones', 2.3978952727983707),
 ('cannavale', 2.3978952727983707),
 ('taoist', 2.3978952727983707),
 ('filone', 2.3978952727983707),
 ('dorfman', 2.3978952727983707),
 ('gracia', 2.3978952727983707),
 ('bruhl', 2.3978952727983707),
 ('pei', 2.3978952727983707),
 ('stitzer', 2.3978952727983707),
 ('gorris', 2.3978952727983707),
 ('regent', 2.3978952727983707),
 ('gundams', 2.3978952727983707),
 ('antonietta', 2.3978952727983707),
 ('seine', 2.3978952727983707),
 ('lok', 2.3978952727983707),
 ('huns', 2.3978952727983707),
 ('nandini', 2.3978952727983707),
 ('bolan', 2.3978952727983707),
 ('zp', 2.3978952727983707),
 ('thursby', 2.3978952727983707),
 ('bazza', 2.3978952727983707),
 ('woronov', 2.3978952727983707),
 ('tykwer', 2.3978952727983707),
 ('pike', 2.3978952727983707),
 ('sjoman', 2.3978952727983707),
 ('dorsey', 2.3978952727983707),
 ('cookbook', 2.3978952727983707),
 ('noriko', 2.3978952727983707),
 ('viennese', 2.3978952727983707),
 ('soapdish', 2.3978952727983707),
 ('fanshawe', 2.3978952727983707),
 ('spacecamp', 2.3978952727983707),
 ('atul', 2.3978952727983707),
 ('hoss', 2.3978952727983707),
 ('yasmin', 2.3978952727983707),
 ('pita', 2.367123614131617),
 ('cal', 2.3608540011180215),
 ('reservations', 2.3513752571634776),
 ('elsa', 2.3513752571634776),
 ('schmid', 2.3513752571634776),
 ('oberon', 2.3513752571634776),
 ('johnston', 2.3513752571634776),
 ('marylee', 2.3513752571634776),
 ('brisson', 2.3513752571634776),
 ('winchester', 2.3353749158170367),
 ('jabba', 2.3272777055844172),
 ('chamberlain', 2.3191143949452564),
 ('jud', 2.3025850929940459),
 ('montrose', 2.3025850929940459),
 ('coop', 2.3025850929940459),
 ('bannister', 2.3025850929940459),
 ('homeward', 2.3025850929940459),
 ('hundstage', 2.3025850929940459),
 ('manny', 2.3025850929940459),
 ('colman', 2.3025850929940459),
 ('tigerland', 2.3025850929940459),
 ('ungar', 2.3025850929940459),
 ('girlfight', 2.3025850929940459),
 ('haines', 2.3025850929940459),
 ('rea', 2.3025850929940459),
 ('flamenco', 2.3025850929940459),
 ('carla', 2.2772672850097559),
 ('hanzo', 2.2772672850097559),
 ('fagin', 2.2772672850097559),
 ('sullavan', 2.2686835413183641),
 ('aunts', 2.2686835413183641),
 ('olympia', 2.2686835413183641),
 ('sabrina', 2.2643638801738479),
 ('superbly', 2.2600254785752498),
 ('linklater', 2.2512917986064953),
 ('elia', 2.2512917986064953),
 ('beetle', 2.2512917986064953),
 ('mccartney', 2.2512917986064953),
 ('tully', 2.2512917986064953),
 ('hickok', 2.2512917986064953),
 ('peters', 2.2512917986064953),
 ('sweetly', 2.2512917986064953),
 ('aborigines', 2.2512917986064953),
 ('zentropa', 2.2512917986064953),
 ('gigi', 2.2512917986064953),
 ('northam', 2.2407096892759584),
 ('tomlinson', 2.2335922215070942),
 ('tenant', 2.2284771208403238),
 ('influential', 2.224623551524334),
 ('kalifornia', 2.224623551524334),
 ('stardust', 2.2192034840549946),
 ('kinnear', 2.2155737160044158),
 ('quartier', 2.1972245773362196),
 ('complement', 2.1972245773362196),
 ('hatcher', 2.1972245773362196),
 ('raoul', 2.1972245773362196),
 ('squire', 2.1972245773362196),
 ('vertigo', 2.1972245773362196),
 ('treasured', 2.1972245773362196),
 ('benet', 2.1972245773362196),
 ('magnificently', 2.1972245773362196),
 ('raines', 2.1972245773362196),
 ('finely', 2.1690537003695232),
 ('cheung', 2.1690537003695232),
 ('stevenson', 2.1690537003695232),
 ('georges', 2.1649637151179979),
 ('perfection', 2.1594842493533721),
 ('weir', 2.1594842493533721),
 ('iris', 2.1594842493533721),
 ('marvelously', 2.1594842493533721),
 ('rukh', 2.1594842493533721),
 ('enchanting', 2.1517622032594619),
 ('sailing', 2.1400661634962708),
 ('kennel', 2.1400661634962708),
 ('brodie', 2.1400661634962708),
 ('dixon', 2.1400661634962708),
 ('anand', 2.1400661634962708),
 ('speakeasy', 2.1400661634962708),
 ('celine', 2.1400661634962708),
 ('province', 2.1400661634962708),
 ('astaire', 2.1400661634962708),
 ('pressburger', 2.1400661634962708),
 ('spellbinding', 2.1400661634962708),
 ('leung', 2.1400661634962708),
 ('mahoney', 2.1400661634962708),
 ('curr', 2.1400661634962708),
 ('cartwright', 2.1400661634962708),
 ('trier', 2.1316272948504063),
 ('pecker', 2.1282317058492679),
 ('wodehouse', 2.120263536200091),
 ('miyazaki', 2.120263536200091),
 ('vulnerability', 2.1102132003465894),
 ('darius', 2.1102132003465894),
 ('marjorie', 2.1102132003465894),
 ('hark', 2.1102132003465894),
 ('devotion', 2.1041341542702074),
 ('loomis', 2.0794415416798357),
 ('tackled', 2.0794415416798357),
 ('mores', 2.0794415416798357),
 ('fez', 2.0794415416798357),
 ('benoit', 2.0794415416798357),
 ('braun', 2.0794415416798357),
 ('devos', 2.0794415416798357),
 ('iago', 2.0794415416798357),
 ('boothe', 2.0794415416798357),
 ('romy', 2.0794415416798357),
 ('janeway', 2.0794415416798357),
 ('footlight', 2.0794415416798357),
 ('yang', 2.0794415416798357),
 ('abbot', 2.0794415416798357),
 ('genesis', 2.0794415416798357),
 ('malone', 2.0794415416798357),
 ('duryea', 2.0794415416798357),
 ('lupin', 2.0794415416798357),
 ('cheech', 2.0541237336955462),
 ('mol', 2.0476928433652555),
 ('ella', 2.0476928433652555),
 ('tel', 2.0476928433652555),
 ('askey', 2.0476928433652555),
 ('fairytale', 2.0476928433652555),
 ('captures', 2.0386195471595809),
 ('tintin', 2.0368819272610401),
 ('curtiz', 2.0368819272610401),
 ('duchess', 2.0368819272610401),
 ('quotable', 2.0368819272610401),
 ('edith', 2.0368819272610401),
 ('pickford', 2.0314323224934752),
 ('scoop', 2.0314323224934752),
 ('voight', 2.0301704926730531),
 ('wonderfully', 2.0218960560332353),
 ('patekar', 2.0149030205422647),
 ('richness', 2.0149030205422647),
 ('ephemeral', 2.0149030205422647),
 ('blaine', 2.0149030205422647),
 ('donnell', 2.0149030205422647),
 ('accomplishes', 2.0149030205422647),
 ('archibald', 2.0149030205422647),
 ('stalkers', 2.0149030205422647),
 ('dearly', 2.0149030205422647),
 ('gilley', 2.0149030205422647),
 ('kessler', 2.0149030205422647),
 ('silvio', 2.0149030205422647),
 ('goa', 2.0149030205422647),
 ('saffron', 2.0149030205422647),
 ('brokedown', 2.0149030205422647),
 ('sasha', 2.0149030205422647),
 ('broomsticks', 2.0149030205422647),
 ('milton', 2.0149030205422647),
 ('tetsuo', 2.0149030205422647),
 ('laird', 2.0149030205422647),
 ('stubby', 2.0149030205422647),
 ('zorro', 2.0053335695261141),
 ('clarence', 2.0014800002101243),
 ('dandy', 1.9980959022258835),
 ('bozz', 1.9924301646902061),
 ('pickpocket', 1.9924301646902061),
 ('pawn', 1.9924301646902061),
 ('frailty', 1.9924301646902061),
 ('egan', 1.9924301646902061),
 ('heartbreaking', 1.9924301646902061),
 ('venezuela', 1.9810014688665833),
 ('powell', 1.9783454248084671),
 ('wang', 1.9740810260220096),
 ('brosnan', 1.9547990964725592),
 ('refreshingly', 1.9459101490553132),
 ('hawke', 1.9459101490553132),
 ('aames', 1.9459101490553132),
 ('collinwood', 1.9459101490553132),
 ('henderson', 1.9459101490553132),
 ('meena', 1.9459101490553132),
 ('preity', 1.9459101490553132),
 ('truman', 1.9459101490553132),
 ('argentine', 1.9459101490553132),
 ('trebor', 1.9459101490553132),
 ('carre', 1.9459101490553132),
 ('ida', 1.9459101490553132),
 ('enthralling', 1.9459101490553132),
 ('noll', 1.9459101490553132),
 ('wegener', 1.9459101490553132),
 ('mala', 1.9459101490553132),
 ('francois', 1.9459101490553132),
 ('transcends', 1.9459101490553132),
 ('wtc', 1.9459101490553132),
 ('cratchit', 1.9459101490553132),
 ('singin', 1.9459101490553132),
 ('gingold', 1.9459101490553132),
 ('rathbone', 1.9459101490553132),
 ('strathairn', 1.9459101490553132),
 ('stairway', 1.9459101490553132),
 ('bafta', 1.9459101490553132),
 ('masterful', 1.9459101490553132),
 ('slipper', 1.9459101490553132),
 ('oshii', 1.9459101490553132),
 ('mostel', 1.9459101490553132),
 ('sg', 1.927891643552635),
 ('masterson', 1.927891643552635),
 ('sunrise', 1.9252908618525775),
 ('lily', 1.9203768470501485),
 ('corbin', 1.9169226121820611),
 ('lila', 1.9169226121820611),
 ('turturro', 1.9095425048844386),
 ('adelaide', 1.9095425048844386),
 ('expertly', 1.9095425048844386),
 ('lighten', 1.9095425048844386),
 ('bakshi', 1.9029851043382795),
 ('lincoln', 1.9014583864844796),
 ('pabst', 1.8971199848858813),
 ('exquisitely', 1.8971199848858813),
 ('sumptuous', 1.8971199848858813),
 ('firefighters', 1.8971199848858813),
 ('skagway', 1.8971199848858813),
 ('finlay', 1.8971199848858813),
 ('europa', 1.8971199848858813),
 ('bhandarkar', 1.8971199848858813),
 ('darren', 1.8925641683500207),
 ('chang', 1.8870696490323797),
 ('booker', 1.8870696490323797),
 ('tomei', 1.8827312474337816),
 ('flik', 1.8718021769015913),
 ('iberia', 1.8718021769015913),
 ('kumari', 1.8718021769015913),
 ('maslin', 1.8718021769015913),
 ('lyman', 1.8718021769015913),
 ('lucienne', 1.8718021769015913),
 ('ozzie', 1.8718021769015913),
 ('lucile', 1.8718021769015913),
 ('cassavettes', 1.8718021769015913),
 ('ronda', 1.8718021769015913),
 ('cypher', 1.8718021769015913),
 ('borowczyk', 1.8718021769015913),
 ('labyrinth', 1.8718021769015913),
 ('blindness', 1.8718021769015913),
 ('informer', 1.8718021769015913),
 ('schygulla', 1.8718021769015913),
 ('butterflies', 1.8718021769015913),
 ('paget', 1.8718021769015913),
 ('collaborators', 1.8718021769015913),
 ('coccio', 1.8718021769015913),
 ('superlative', 1.8718021769015913),
 ('aborigine', 1.8718021769015913),
 ('rolle', 1.8718021769015913),
 ('impressionist', 1.8718021769015913),
 ('ellie', 1.8718021769015913),
 ('kher', 1.8718021769015913),
 ('holt', 1.8718021769015913),
 ('solicitor', 1.8718021769015913),
 ('bettany', 1.8718021769015913),
 ('bulimia', 1.8718021769015913),
 ('busby', 1.8718021769015913),
 ('stealer', 1.8718021769015913),
 ('spellbound', 1.8718021769015913),
 ('observant', 1.8718021769015913),
 ('greene', 1.8607523407150064),
 ('carell', 1.8588987720656835),
 ('sematary', 1.8562979903656263),
 ('refreshing', 1.8551812956655511),
 ('montana', 1.8538912503350613),
 ('pegg', 1.8523840910444898),
 ('breathtaking', 1.8481124057791867),
 ('bourne', 1.8478489358790986),
 ('siu', 1.8458266904983307),
 ('prix', 1.8458266904983307),
 ('teri', 1.8458266904983307),
 ('lemmon', 1.8458266904983307),
 ('splendidly', 1.8458266904983307),
 ('twisty', 1.8458266904983307),
 ('uncompromising', 1.8458266904983307),
 ('hardwicke', 1.8458266904983307),
 ('electrifying', 1.8458266904983307),
 ('mendes', 1.8325814637483102),
 ('morbius', 1.8325814637483102),
 ('guiness', 1.8325814637483102),
 ('brock', 1.8325814637483102),
 ('walsh', 1.8325814637483102),
 ('abby', 1.8325814637483102),
 ('zelda', 1.8325814637483102),
 ('zabriskie', 1.824549292051046),
 ('connolly', 1.824549292051046),
 ('keeler', 1.824549292051046),
 ('carface', 1.824549292051046),
 ('batwoman', 1.824549292051046),
 ('vincenzo', 1.8191584434161694),
 ('precise', 1.8152899666382492),
 ('parrot', 1.8152899666382492),
 ('explores', 1.8082887711792655),
 ('steele', 1.8082887711792655),
 ('delightful', 1.8002701588959635),
 ('flynn', 1.7996646487351682),
 ('rafael', 1.791759469228055),
 ('franks', 1.791759469228055),
 ('celebrates', 1.791759469228055),
 ('corsaut', 1.791759469228055),
 ('conor', 1.791759469228055),
 ('breathes', 1.791759469228055),
 ('geddes', 1.791759469228055),
 ('enforced', 1.791759469228055),
 ('perseverance', 1.791759469228055),
 ('colonialism', 1.791759469228055),
 ('demunn', 1.791759469228055),
 ('advent', 1.791759469228055),
 ('tumultuous', 1.791759469228055),
 ('jong', 1.791759469228055),
 ('leia', 1.791759469228055),
 ('auer', 1.791759469228055),
 ('strangler', 1.791759469228055),
 ('culp', 1.791759469228055),
 ('weismuller', 1.791759469228055),
 ('patric', 1.791759469228055),
 ('uproarious', 1.791759469228055),
 ('nan', 1.791759469228055),
 ('talespin', 1.791759469228055),
 ('shemp', 1.791759469228055),
 ('linden', 1.791759469228055),
 ('mischa', 1.791759469228055),
 ('nicolai', 1.791759469228055),
 ('marcus', 1.791759469228055),
 ('examines', 1.791759469228055),
 ('parisian', 1.791759469228055),
 ('runyon', 1.791759469228055),
 ('witherspoon', 1.791759469228055),
 ('bearer', 1.791759469228055),
 ('champions', 1.791759469228055),
 ('robust', 1.791759469228055),
 ('dedlock', 1.791759469228055),
 ('evans', 1.791759469228055),
 ('britton', 1.791759469228055),
 ('zandalee', 1.791759469228055),
 ('rogen', 1.791759469228055),
 ('lumire', 1.791759469228055),
 ('heightens', 1.791759469228055),
 ('crouse', 1.791759469228055),
 ('travers', 1.791759469228055),
 ('raja', 1.791759469228055),
 ('prem', 1.791759469228055),
 ('trejo', 1.791759469228055),
 ('nath', 1.791759469228055),
 ('massey', 1.791759469228055),
 ('tadashi', 1.791759469228055),
 ('strides', 1.791759469228055),
 ('trotta', 1.791759469228055),
 ('mower', 1.791759469228055),
 ('leisen', 1.791759469228055),
 ('undying', 1.791759469228055),
 ('rory', 1.791759469228055),
 ('chess', 1.7797832781813394),
 ('andrews', 1.7764919970972666),
 ('homer', 1.7692866133759964),
 ('apartheid', 1.7635885922613586),
 ('beautifully', 1.7626953362841438),
 ('foch', 1.7578579175523736),
 ('soccer', 1.7578579175523736),
 ('friendships', 1.7578579175523736),
 ('maclean', 1.7578579175523736),
 ('ira', 1.7491998548092591),
 ('deliciously', 1.7491998548092591),
 ('reginald', 1.7491998548092591),
 ('miners', 1.7491998548092591),
 ('todesking', 1.7491998548092591),
 ('lumet', 1.7462970951512977),
 ('affection', 1.7452394535931621),
 ('cedric', 1.742969305058623),
 ('bittersweet', 1.742969305058623),
 ('elvira', 1.7397031072720019),
 ('carrell', 1.7346010553881064),
 ('silhouette', 1.7346010553881064),
 ('radium', 1.7346010553881064),
 ('custer', 1.7346010553881064),
 ('caprice', 1.7346010553881064),
 ('stepsisters', 1.7346010553881064),
 ('bureaucracy', 1.7346010553881064),
 ('shefali', 1.7346010553881064),
 ('kovacs', 1.7346010553881064),
 ('ilona', 1.7346010553881064),
 ('provo', 1.7346010553881064),
 ('hoon', 1.7346010553881064),
 ('dell', 1.7346010553881064),
 ('ullman', 1.7346010553881064),
 ('axel', 1.7346010553881064),
 ('deft', 1.7346010553881064),
 ('vulcan', 1.7346010553881064),
 ('entranced', 1.7346010553881064),
 ('scorpion', 1.7346010553881064),
 ('kidman', 1.729239112246721),
 ('paperhouse', 1.7227665977411035),
 ('underrated', 1.7197859696029656),
 ('sopranos', 1.7197859696029656),
 ('myrna', 1.7176514970743331),
 ('quintessential', 1.7176514970743331),
 ('gripping', 1.7165360479904674),
 ('superb', 1.7091514458966952),
 ('mastery', 1.7047480922384253),
 ('kibbee', 1.7047480922384253),
 ('borden', 1.7047480922384253),
 ('pension', 1.7047480922384253),
 ('partnership', 1.7047480922384253),
 ('extravagant', 1.7047480922384253),
 ('sternberg', 1.7047480922384253),
 ('montand', 1.7047480922384253),
 ('perceptions', 1.7047480922384253),
 ('minton', 1.7047480922384253),
 ('expansion', 1.7047480922384253),
 ('rail', 1.7047480922384253),
 ('albuquerque', 1.7047480922384253),
 ('coveted', 1.7047480922384253),
 ('celeste', 1.7047480922384253),
 ('lassick', 1.7047480922384253),
 ('apollonia', 1.7047480922384253),
 ('rippner', 1.7047480922384253),
 ('poirot', 1.7047480922384253),
 ('birdie', 1.7047480922384253),
 ('eduardo', 1.7047480922384253),
 ('gorshin', 1.7047480922384253),
 ('friel', 1.7047480922384253),
 ('expressionistic', 1.7047480922384253),
 ('nunsploitation', 1.7047480922384253),
 ('connecticut', 1.7047480922384253),
 ('buttgereit', 1.7047480922384253),
 ('mavens', 1.7047480922384253),
 ('civilized', 1.7047480922384253),
 ('nina', 1.7047480922384253),
 ('rediscovered', 1.7047480922384253),
 ('moonstruck', 1.7047480922384253),
 ('dukakis', 1.7047480922384253),
 ('snare', 1.7047480922384253),
 ('warms', 1.7047480922384253),
 ('gallows', 1.7047480922384253),
 ('doolittle', 1.7047480922384253),
 ('criterion', 1.7047480922384253),
 ('dickinson', 1.7047480922384253),
 ('delon', 1.7047480922384253),
 ('cameroon', 1.7047480922384253),
 ('han', 1.6916760106710724),
 ('ealing', 1.6916760106710724),
 ('paula', 1.6863989535702288),
 ('yoda', 1.6863989535702288),
 ('holm', 1.6863989535702288),
 ('deliverance', 1.6863989535702288),
 ('weaves', 1.6863989535702288),
 ('bagdad', 1.6863989535702288),
 ('determination', 1.6817585740137264),
 ('muller', 1.6739764335716716),
 ('crashers', 1.6739764335716716),
 ('romanticized', 1.6739764335716716),
 ('schmidt', 1.6739764335716716),
 ('petition', 1.6739764335716716),
 ('jerome', 1.6739764335716716),
 ('doodlebops', 1.6739764335716716),
 ('bulldog', 1.6739764335716716),
 ('mvp', 1.6739764335716716),
 ('textile', 1.6739764335716716),
 ('scola', 1.6739764335716716),
 ('tierney', 1.6739764335716716),
 ('janice', 1.6739764335716716),
 ('sceptical', 1.6739764335716716),
 ('krabbe', 1.6739764335716716),
 ('caleb', 1.6739764335716716),
 ('delight', 1.6714733033535532),
 ('welles', 1.6677068205580761),
 ('reeve', 1.6677068205580761),
 ('zelah', 1.6650077635889111),
 ('sadness', 1.663505133704376),
 ('accustomed', 1.6582280766035324),
 ('shia', 1.6582280766035324),
 ('hermann', 1.6582280766035324),
 ('palsy', 1.6582280766035324),
 ('meatball', 1.6582280766035324),
 ('proposes', 1.6582280766035324),
 ('technicolor', 1.65455834771457),
 ('ae', 1.6529230243738393),
 ('nicky', 1.6486586255873816),
 ('soylent', 1.6486586255873816),
 ('restoration', 1.6486586255873816),
 ('tenderness', 1.6486586255873816),
 ('maintained', 1.6486586255873816),
 ('joyous', 1.6486586255873816),
 ('kline', 1.6422277352570913),
 ('sinatra', 1.6389967146756448),
 ('touching', 1.637217476541176),
 ('marisa', 1.634130525024472),
 ('stadium', 1.634130525024472),
 ('gershwin', 1.6314168191528755),
 ('timeless', 1.62924053973028),
 ('macy', 1.6211339521972916),
 ('unforgettable', 1.6177367152487956),
 ('favorites', 1.6158688027643908),
 ('stewart', 1.6119987332957739),
 ('grayson', 1.6094379124341003),
 ('shanks', 1.6094379124341003),
 ('airwolf', 1.6094379124341003),
 ('congrats', 1.6094379124341003),
 ('mammoth', 1.6094379124341003),
 ('henri', 1.6094379124341003),
 ('mammy', 1.6094379124341003),
 ('kabal', 1.6094379124341003),
 ('weber', 1.6094379124341003),
 ('prelude', 1.6094379124341003),
 ('taka', 1.6094379124341003),
 ('cruz', 1.6094379124341003),
 ('cocktails', 1.6094379124341003),
 ('judson', 1.6094379124341003),
 ('blier', 1.6094379124341003),
 ('enforcer', 1.6094379124341003),
 ('roberta', 1.6094379124341003),
 ('pendleton', 1.6094379124341003),
 ('internationally', 1.6094379124341003),
 ('jonny', 1.6094379124341003),
 ('taft', 1.6094379124341003),
 ('funhouse', 1.6094379124341003),
 ('monarchy', 1.6094379124341003),
 ('roshan', 1.6094379124341003),
 ('panda', 1.6094379124341003),
 ('patten', 1.6094379124341003),
 ('restrictive', 1.6094379124341003),
 ('compliments', 1.6094379124341003),
 ('anansa', 1.6094379124341003),
 ('duc', 1.6094379124341003),
 ('florinda', 1.6094379124341003),
 ('franchot', 1.6094379124341003),
 ('hartley', 1.6094379124341003),
 ('candid', 1.6094379124341003),
 ('breakdancing', 1.6094379124341003),
 ('sorbonne', 1.6094379124341003),
 ('noire', 1.6094379124341003),
 ('hoodlums', 1.6094379124341003),
 ('sullivan', 1.6094379124341003),
 ('perceptive', 1.6094379124341003),
 ('serrault', 1.6094379124341003),
 ('bloch', 1.6094379124341003),
 ('extraordinary', 1.6094379124341003),
 ('retriever', 1.6094379124341003),
 ('considerations', 1.6094379124341003),
 ('ringo', 1.6094379124341003),
 ('hahk', 1.6094379124341003),
 ('zenith', 1.6094379124341003),
 ('outstandingly', 1.6094379124341003),
 ('orphaned', 1.6094379124341003),
 ('dahlia', 1.6094379124341003),
 ('ponderosa', 1.6094379124341003),
 ('humanism', 1.6094379124341003),
 ('antidote', 1.6094379124341003),
 ('rugged', 1.6094379124341003),
 ('synthesis', 1.6094379124341003),
 ('lanchester', 1.6094379124341003),
 ('paxinou', 1.6094379124341003),
 ('tsing', 1.6094379124341003),
 ('competitor', 1.6094379124341003),
 ('summertime', 1.6094379124341003),
 ('duets', 1.6094379124341003),
 ('mcgrath', 1.6094379124341003),
 ('repulsion', 1.6094379124341003),
 ('eytan', 1.6094379124341003),
 ('grasshopper', 1.6094379124341003),
 ('everytown', 1.6094379124341003),
 ('hedy', 1.6094379124341003),
 ('priorities', 1.6094379124341003),
 ('kurosawa', 1.6094379124341003),
 ('chico', 1.6094379124341003),
 ('meloni', 1.6094379124341003),
 ('moulin', 1.6094379124341003),
 ('glacier', 1.6094379124341003),
 ('regency', 1.6094379124341003),
 ('advancing', 1.6094379124341003),
 ('complexities', 1.6094379124341003),
 ('unavailable', 1.6094379124341003),
 ('wai', 1.6094379124341003),
 ('nunez', 1.6094379124341003),
 ('brilliantly', 1.5950491749820008),
 ('einstein', 1.5910887737659039),
 ('liu', 1.5910887737659039),
 ('clara', 1.5910887737659039),
 ('dustin', 1.589235205116581),
 ('iran', 1.5841201044498106),
 ('conductor', 1.5841201044498106),
 ('shanghai', 1.5804503755608481),
 ('rainer', 1.575536360758419),
 ('alienate', 1.575536360758419),
 ('mesmerizing', 1.5723966407537513),
 ('raul', 1.5686159179138452),
 ('friendship', 1.5677652160335325),
 ('wonderful', 1.5645425925262093),
 ('sergeants', 1.5581446180465499),
 ('layered', 1.5581446180465499),
 ('corinne', 1.5581446180465499),
 ('seamlessly', 1.5581446180465499),
 ('demme', 1.5581446180465499),
 ('moriarty', 1.5581446180465499),
 ('trading', 1.5581446180465499),
 ...]

In [43]:
# the 30 lowest-scoring entries of pos_neg_ratios, lowest first:
# the words leaning most strongly toward reviews labelled "NEGATIVE"
pos_neg_ratios.most_common()[:-31:-1]


Out[43]:
[('nisha', -4.6051701859880918),
 ('slater', -4.6051701859880918),
 ('ramtha', -4.6051701859880918),
 ('eod', -4.6051701859880918),
 ('dunaway', -4.6051701859880918),
 ('weisz', -4.6051701859880918),
 ('mckenna', -4.6051701859880918),
 ('zenia', -4.6051701859880918),
 ('tashan', -4.6051701859880918),
 ('dushku', -4.6051701859880918),
 ('gymkata', -4.6051701859880918),
 ('horrorfest', -4.6051701859880918),
 ('lordi', -4.6051701859880918),
 ('caulfield', -4.6051701859880918),
 ('nepotism', -4.6051701859880918),
 ('swinton', -4.6051701859880918),
 ('schwartzman', -4.6051701859880918),
 ('mraovich', -4.6051701859880918),
 ('carnosaur', -4.6051701859880918),
 ('strummer', -4.6051701859880918),
 ('gammera', -4.6051701859880918),
 ('balding', -4.6051701859880918),
 ('borel', -4.6051701859880918),
 ('richie', -4.6051701859880918),
 ('mohanlal', -4.6051701859880918),
 ('brinke', -4.6051701859880918),
 ('reb', -4.6051701859880918),
 ('pharaoh', -4.6051701859880918),
 ('cato', -4.6051701859880918),
 ('mallory', -4.6051701859880918)]

In [44]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()


Loading BokehJS ...

In [45]:
# Histogram of the per-word scores stored in pos_neg_ratios.
# Only `density=True` is passed: the legacy `normed` flag meant the same thing,
# was ignored whenever `density` was given, and is rejected by current NumPy.
word_scores = [score for _, score in pos_neg_ratios.most_common()]
hist, edges = np.histogram(word_scores, density=True, bins=100)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Word Positive/Negative Affinity Distribution")
# one bar per bin: edges[:-1] are the left sides, edges[1:] the right sides
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)



In [46]:
# For every occurrence count, tally how many distinct words have exactly that count.
# Counts are fed in most_common() order so the keys are inserted in the same order as before.
frequency_frequency = Counter(cnt for _, cnt in total_counts.most_common())

In [47]:
# Histogram of "how many words share a given occurrence count".
# Only `density=True` is passed: the legacy `normed` flag meant the same thing,
# was ignored whenever `density` was given, and is rejected by current NumPy.
words_per_count = [n_words for _, n_words in frequency_frequency.most_common()]
hist, edges = np.histogram(words_per_count, density=True, bins=100)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="The frequency distribution of the words in our corpus")
# one bar per bin: edges[:-1] are the left sides, edges[1:] the right sides
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)



In [81]:
import time
import sys
import numpy as np

# Let's tweak our network from before to model these phenomena
class SentimentNetwork:
    """Two-layer sentiment classifier over a filtered bag-of-words vocabulary.

    The input layer has one node per vocabulary word, the hidden layer is
    linear, and the single output node is a sigmoid whose value is read as
    "POSITIVE" when it is >= 0.5 and "NEGATIVE" otherwise.

    The vocabulary is reduced in two ways while pre-processing:
      * words whose total count is not above ``min_count`` are dropped;
      * words that have a positive/negative score (see ``RATIO_MIN_COUNT``)
        are dropped when the absolute score is below ``polarity_cutoff``.
    """

    # A word needs at least this many total occurrences before a
    # positive/negative score is computed for it; rarer words are never
    # subject to the polarity cutoff.
    RATIO_MIN_COUNT = 50

    def __init__(self, reviews, labels, min_count=10, polarity_cutoff=0.1, hidden_nodes=10, learning_rate=0.1):
        """Build the vocabulary from ``reviews``/``labels`` and create the weights.

        Args:
            reviews: sequence of review strings, words separated by single spaces.
            labels: sequence of "POSITIVE"/"NEGATIVE" strings aligned with ``reviews``.
            min_count: a word is kept only if it occurs more than this many times.
            polarity_cutoff: minimum absolute score for scored words to be kept.
            hidden_nodes: width of the hidden layer.
            learning_rate: step size used by ``train``.
        """
        # Seed NumPy's global generator so the random output weights are repeatable.
        np.random.seed(1)

        self.pre_process_data(reviews, polarity_cutoff, min_count, labels)

        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, polarity_cutoff, min_count, labels=None):
        """Count words, score their polarity and build the vocab/index tables.

        Sets ``review_vocab``, ``label_vocab``, their sizes, ``word2index`` and
        ``label2index`` on the instance.

        Args:
            reviews: sequence of review strings.
            polarity_cutoff: minimum absolute score for scored words to be kept.
            min_count: a word is kept only if it occurs more than this many times.
            labels: labels aligned with ``reviews``. When omitted, the
                module-level ``labels`` variable is used, which is what older
                three-argument callers relied on.

        Raises:
            ValueError: if there are fewer labels than reviews.
        """
        if labels is None:
            # Backward compatibility only: this method used to read the
            # notebook-global `labels` instead of receiving them.
            labels = globals()["labels"]
        if len(labels) < len(reviews):
            raise ValueError("need one label per review: got %d reviews but %d labels"
                             % (len(reviews), len(labels)))

        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()

        # zip stops at the shorter sequence, i.e. at len(reviews) (checked above).
        for review, label in zip(reviews, labels):
            words = review.split(" ")
            total_counts.update(words)
            if label == 'POSITIVE':
                positive_counts.update(words)
            else:
                negative_counts.update(words)

        # Score = log of the positive/negative count ratio, computed only for
        # sufficiently frequent words. Ratios <= 1 are offset by 0.01 before
        # the log so that a ratio of 0 stays finite.
        pos_neg_ratios = Counter()
        for term, cnt in total_counts.items():
            if cnt >= self.RATIO_MIN_COUNT:
                ratio = positive_counts[term] / float(negative_counts[term] + 1)
                if ratio > 1:
                    pos_neg_ratios[term] = np.log(ratio)
                else:
                    pos_neg_ratios[term] = -np.log(1 / (ratio + 0.01))

        # Every word of every review is a key of total_counts, so walking the
        # unique words is enough to decide vocabulary membership.
        review_vocab = set()
        for word, cnt in total_counts.items():
            if cnt <= min_count:
                continue
            if word in pos_neg_ratios and abs(pos_neg_ratios[word]) < polarity_cutoff:
                continue
            review_vocab.add(word)
        # Sorted so the word -> index mapping does not depend on set iteration order.
        self.review_vocab = sorted(review_vocab)

        self.label_vocab = sorted(set(labels))

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        self.word2index = {word: i for i, word in enumerate(self.review_vocab)}
        self.label2index = {label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Allocate the weight matrices and the reusable layer buffers.

        Args:
            input_nodes: number of input nodes (vocabulary size).
            hidden_nodes: number of hidden nodes.
            output_nodes: number of output nodes.
            learning_rate: step size used by ``train``.
        """
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Input-to-hidden weights start at zero; hidden-to-output weights are
        # drawn from a normal distribution with std output_nodes ** -0.5.
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                            (self.hidden_nodes, self.output_nodes))

        self.learning_rate = learning_rate

        # Row-vector buffers that are zeroed and refilled for each review.
        self.layer_0 = np.zeros((1, input_nodes))
        self.layer_1 = np.zeros((1, hidden_nodes))

    def sigmoid(self, x):
        """Return the logistic function 1 / (1 + e^-x) of ``x``."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        """Return the sigmoid derivative expressed through the sigmoid's output."""
        return output * (1 - output)

    def update_input_layer(self, review):
        """Set ``layer_0`` to the binary bag-of-words encoding of ``review``.

        Words that are not in the vocabulary (never seen, or filtered out by
        ``min_count``/``polarity_cutoff``) are ignored.
        """
        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        for word in review.split(" "):
            index = self.word2index.get(word)
            if index is not None:
                self.layer_0[0][index] = 1

    def get_target_for_label(self, label):
        """Map a label to the training target: 1 for 'POSITIVE', 0 for anything else."""
        return 1 if label == 'POSITIVE' else 0

    @staticmethod
    def _reviews_per_second(done, start):
        """Return ``done`` divided by the seconds since ``start`` (0.0 if no time has passed)."""
        elapsed = time.time() - start
        return done / elapsed if elapsed > 0 else 0.0

    def train(self, training_reviews_raw, training_labels):
        """Run one pass of per-review gradient descent and print running progress.

        Args:
            training_reviews_raw: sequence of review strings.
            training_labels: "POSITIVE"/"NEGATIVE" labels aligned with the reviews.
        """
        # Reduce every review to the distinct vocabulary indices it contains;
        # out-of-vocabulary words are dropped here.
        training_reviews = []
        for review in training_reviews_raw:
            indices = {self.word2index[word]
                       for word in review.split(" ")
                       if word in self.word2index}
            training_reviews.append(list(indices))

        assert(len(training_reviews) == len(training_labels))

        correct_so_far = 0
        start = time.time()

        for i in range(len(training_reviews)):
            review = training_reviews[i]
            label = training_labels[i]

            ### Forward pass ###

            # Hidden layer: the input is a 0/1 vector, so layer_0.dot(weights_0_1)
            # equals the sum of the weight rows of the words that are present.
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            # Output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

            ### Backward pass ###

            # Output error: difference between actual output and desired target.
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Error propagated to the hidden layer; the hidden layer has no
            # nonlinearity, so its delta is the error itself.
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)
            layer_1_delta = layer_1_error

            # Gradient descent step on the hidden-to-output weights ...
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate
            # ... and on the input-to-hidden rows of the words in this review only.
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate

            # layer_2 has shape (1, 1); score the prediction made before the update.
            output = layer_2[0][0]
            if output >= 0.5 and label == 'POSITIVE':
                correct_so_far += 1
            if output < 0.5 and label == 'NEGATIVE':
                correct_so_far += 1

            reviews_per_second = self._reviews_per_second(i, start)

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4]
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5]
                             + " #Correct:" + str(correct_so_far)
                             + " #Trained:" + str(i+1)
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")

    def test(self, testing_reviews, testing_labels):
        """Classify every review with ``run`` and print the running accuracy.

        Args:
            testing_reviews: sequence of review strings.
            testing_labels: "POSITIVE"/"NEGATIVE" labels aligned with the reviews.
        """
        correct = 0
        start = time.time()

        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if pred == testing_labels[i]:
                correct += 1

            reviews_per_second = self._reviews_per_second(i, start)

            # No "%" after the speed: it is a rate, not a percentage.
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4]
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5]
                             + " #Correct:" + str(correct)
                             + " #Tested:" + str(i+1)
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def run(self, review):
        """Return "POSITIVE" or "NEGATIVE" for a single review string.

        The review is lower-cased first; words outside the vocabulary are ignored.
        """
        # Hidden layer: sum the weight rows of the distinct known words.
        self.layer_1 *= 0
        unique_indices = {self.word2index[word]
                          for word in review.lower().split(" ")
                          if word in self.word2index}
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]

        # Output layer
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

        if layer_2[0] >= 0.5:
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [82]:
# Build the network on everything except the last 1000 reviews (kept for testing).
# min_count=20 keeps only words seen more than 20 times; polarity_cutoff=0.05 drops
# scored words whose absolute positive/negative score is below 0.05.
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.05,learning_rate=0.01)

In [83]:
mlp.train(reviews[:-1000],labels[:-1000])


Progress:99.9% Speed(reviews/sec):1472. #Correct:20461 #Trained:24000 Training Accuracy:85.2%

In [84]:
mlp.test(reviews[-1000:],labels[-1000:])


Progress:99.9% Speed(reviews/sec):2441.% #Correct:859 #Tested:1000 Testing Accuracy:85.9%

In [85]:
# Same split and min_count as before, but with a much stricter polarity_cutoff (0.8):
# scored words are kept only when their absolute score is at least 0.8, which shrinks
# the vocabulary. Words that have no score are still kept.
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.8,learning_rate=0.01)

In [86]:
mlp.train(reviews[:-1000],labels[:-1000])


Progress:99.9% Speed(reviews/sec):6868. #Correct:20552 #Trained:24000 Training Accuracy:85.6%

In [87]:
mlp.test(reviews[-1000:],labels[-1000:])


Progress:99.9% Speed(reviews/sec):5910.% #Correct:822 #Tested:1000 Testing Accuracy:82.2%

What's Going On in the Weights?


In [88]:
# min_count=0 and polarity_cutoff=0 switch both vocabulary filters off, so every word
# that appears in the training reviews gets its own row in weights_0_1.
mlp_full = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=0,polarity_cutoff=0,learning_rate=0.01)

In [89]:
mlp_full.train(reviews[:-1000],labels[:-1000])


Progress:99.9% Speed(reviews/sec):1393. #Correct:20335 #Trained:24000 Training Accuracy:84.7%

In [90]:
Image(filename='sentiment_network_sparse.png')


Out[90]:

In [91]:
import matplotlib.colors as colors

In [92]:
# Take the 500 highest-scoring and the 500 lowest-scoring entries of pos_neg_ratios
# (in that order) and keep the words the full network has an index for.
known_words = mlp_full.word2index
top_positive = pos_neg_ratios.most_common(500)
top_negative = pos_neg_ratios.most_common()[:-501:-1]
words_to_visualize = [word
                      for word, _ in top_positive + top_negative
                      if word in known_words]

In [102]:
# Collect, for every word to visualize, its learned input-to-hidden weight vector
# and a display colour: green for a positive score, black otherwise.
pos = 0  # number of words with a positive score
neg = 0  # number of words with a zero or negative score

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in pos_neg_ratios:
        vectors_list.append(mlp_full.weights_0_1[mlp_full.word2index[word]])
        if(pos_neg_ratios[word] > 0):
            pos+=1
            # rgb2hex takes channel values in the 0-1 range, so the score is
            # clamped to 1.0, and the complete "#rrggbb" string is kept.
            green = min(1.0, pos_neg_ratios[word])
            colors_list.append(colors.rgb2hex([0, green, 0]))
        else:
            neg+=1
            colors_list.append("#000000")

In [103]:
len(vectors_list)


Out[103]:
989

In [104]:
len(colors_list)


Out[104]:
494

In [105]:
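# NOTE: the plotting cell below reads `words_top_ted_tsne`, which is not assigned
# anywhere else in this part of the notebook. Uncomment these lines (they need
# scikit-learn) and run them before plotting on a fresh kernel.
# from sklearn.manifold import TSNE
# tsne = TSNE(n_components=2, random_state=0)
# words_top_ted_tsne = tsne.fit_transform(vectors_list)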

In [106]:
# Scatter plot of the 2-D projection of the word vectors, one point per word.
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

# The colours live in the data source next to the coordinates: Bokeh does not
# accept a ColumnDataSource together with a literal per-point list.
source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    colors=colors_list))

p.scatter(x="x1", y="x2", size=8, source=source, color="colors")

# Text labels are prepared but left off the plot; uncomment add_layout to show them.
word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
# p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words



In [ ]: